X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2Fanv_batch_chain.c;h=9180f908379543f80a3e758e013852cfb3528abe;hb=bc9d7836bc6a448d0328f090b8d538411f8aa1a0;hp=29c79951be7939cf6cfa07cade0aa7963468d031;hpb=595400d57745fba198b42d95f3c4f5d855023c33;p=mesa.git diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index 29c79951be7..9180f908379 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -29,9 +29,10 @@ #include "anv_private.h" -#include "genxml/gen7_pack.h" #include "genxml/gen8_pack.h" +#include "util/debug.h" + /** \file anv_batch_chain.c * * This file contains functions related to anv_cmd_buffer as a data @@ -45,50 +46,59 @@ * Functions related to anv_reloc_list *-----------------------------------------------------------------------*/ +VkResult +anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + memset(list, 0, sizeof(*list)); + return VK_SUCCESS; +} + static VkResult anv_reloc_list_init_clone(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, const struct anv_reloc_list *other_list) { - if (other_list) { - list->num_relocs = other_list->num_relocs; - list->array_length = other_list->array_length; - } else { - list->num_relocs = 0; - list->array_length = 256; - } - - list->relocs = - vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (list->relocs == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - list->reloc_bos = - vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + list->num_relocs = other_list->num_relocs; + list->array_length = other_list->array_length; + + if (list->num_relocs > 0) { + list->relocs = + vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->relocs == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - if (list->reloc_bos == NULL) { - vk_free(alloc, list->relocs); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } + list->reloc_bos = + vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->reloc_bos == NULL) { + vk_free(alloc, list->relocs); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } - if (other_list) { memcpy(list->relocs, other_list->relocs, list->array_length * sizeof(*list->relocs)); memcpy(list->reloc_bos, other_list->reloc_bos, list->array_length * sizeof(*list->reloc_bos)); + } else { + list->relocs = NULL; + list->reloc_bos = NULL; } - return VK_SUCCESS; -} + list->dep_words = other_list->dep_words; -VkResult -anv_reloc_list_init(struct anv_reloc_list *list, - const VkAllocationCallbacks *alloc) -{ - return anv_reloc_list_init_clone(list, alloc, NULL); + if (list->dep_words > 0) { + list->deps = + vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + memcpy(list->deps, other_list->deps, + list->dep_words * sizeof(BITSET_WORD)); + } else { + list->deps = NULL; + } + + return VK_SUCCESS; } void @@ -97,6 +107,7 @@ anv_reloc_list_finish(struct anv_reloc_list *list, { vk_free(alloc, list->relocs); vk_free(alloc, list->reloc_bos); + vk_free(alloc, list->deps); } static VkResult @@ -107,84 +118,135 @@ anv_reloc_list_grow(struct anv_reloc_list *list, if (list->num_relocs + num_additional_relocs <= list->array_length) return VK_SUCCESS; - size_t new_length = list->array_length * 2; + size_t new_length = MAX2(16, 
list->array_length * 2); while (new_length < list->num_relocs + num_additional_relocs) new_length *= 2; struct drm_i915_gem_relocation_entry *new_relocs = - vk_alloc(alloc, new_length * sizeof(*list->relocs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + vk_realloc(alloc, list->relocs, + new_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (new_relocs == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + list->relocs = new_relocs; struct anv_bo **new_reloc_bos = - vk_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (new_reloc_bos == NULL) { - vk_free(alloc, new_relocs); + vk_realloc(alloc, list->reloc_bos, + new_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_reloc_bos == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } + list->reloc_bos = new_reloc_bos; + + list->array_length = new_length; - memcpy(new_relocs, list->relocs, list->num_relocs * sizeof(*list->relocs)); - memcpy(new_reloc_bos, list->reloc_bos, - list->num_relocs * sizeof(*list->reloc_bos)); + return VK_SUCCESS; +} - vk_free(alloc, list->relocs); - vk_free(alloc, list->reloc_bos); +static VkResult +anv_reloc_list_grow_deps(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t min_num_words) +{ + if (min_num_words <= list->dep_words) + return VK_SUCCESS; - list->array_length = new_length; - list->relocs = new_relocs; - list->reloc_bos = new_reloc_bos; + uint32_t new_length = MAX2(32, list->dep_words * 2); + while (new_length < min_num_words) + new_length *= 2; + + BITSET_WORD *new_deps = + vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_deps == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + list->deps = new_deps; + + /* Zero out the new data */ + memset(list->deps + list->dep_words, 0, + (new_length - list->dep_words) * sizeof(BITSET_WORD)); + list->dep_words = new_length; return VK_SUCCESS; } -uint64_t +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +VkResult anv_reloc_list_add(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, - uint32_t offset, struct anv_bo *target_bo, uint32_t delta) + uint32_t offset, struct anv_bo *target_bo, uint32_t delta, + uint64_t *address_u64_out) { struct drm_i915_gem_relocation_entry *entry; int index; - const uint32_t domain = - target_bo->is_winsys_bo ? I915_GEM_DOMAIN_RENDER : 0; + struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo); + uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset); + if (address_u64_out) + *address_u64_out = target_bo_offset + delta; - anv_reloc_list_grow(list, alloc, 1); - /* TODO: Handle failure */ + if (unwrapped_target_bo->flags & EXEC_OBJECT_PINNED) { + assert(!target_bo->is_wrapper); + uint32_t idx = unwrapped_target_bo->gem_handle; + anv_reloc_list_grow_deps(list, alloc, (idx / BITSET_WORDBITS) + 1); + BITSET_SET(list->deps, unwrapped_target_bo->gem_handle); + return VK_SUCCESS; + } + + VkResult result = anv_reloc_list_grow(list, alloc, 1); + if (result != VK_SUCCESS) + return result; /* XXX: Can we use I915_EXEC_HANDLE_LUT? 
*/ index = list->num_relocs++; list->reloc_bos[index] = target_bo; entry = &list->relocs[index]; - entry->target_handle = target_bo->gem_handle; + entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */ entry->delta = delta; entry->offset = offset; - entry->presumed_offset = target_bo->offset; - entry->read_domains = domain; - entry->write_domain = domain; + entry->presumed_offset = target_bo_offset; + entry->read_domains = 0; + entry->write_domain = 0; VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry))); - return target_bo->offset + delta; + return VK_SUCCESS; } static void +anv_reloc_list_clear(struct anv_reloc_list *list) +{ + list->num_relocs = 0; + if (list->dep_words > 0) + memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD)); +} + +static VkResult anv_reloc_list_append(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, struct anv_reloc_list *other, uint32_t offset) { - anv_reloc_list_grow(list, alloc, other->num_relocs); - /* TODO: Handle failure */ + VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs); + if (result != VK_SUCCESS) + return result; - memcpy(&list->relocs[list->num_relocs], &other->relocs[0], - other->num_relocs * sizeof(other->relocs[0])); - memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], - other->num_relocs * sizeof(other->reloc_bos[0])); + if (other->num_relocs > 0) { + memcpy(&list->relocs[list->num_relocs], &other->relocs[0], + other->num_relocs * sizeof(other->relocs[0])); + memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], + other->num_relocs * sizeof(other->reloc_bos[0])); - for (uint32_t i = 0; i < other->num_relocs; i++) - list->relocs[i + list->num_relocs].offset += offset; + for (uint32_t i = 0; i < other->num_relocs; i++) + list->relocs[i + list->num_relocs].offset += offset; - list->num_relocs += other->num_relocs; + list->num_relocs += other->num_relocs; + } + + anv_reloc_list_grow_deps(list, alloc, other->dep_words); + for (uint32_t w = 0; w < other->dep_words; w++) + list->deps[w] |= other->deps[w]; + + return VK_SUCCESS; } /*-----------------------------------------------------------------------* @@ -194,8 +256,13 @@ anv_reloc_list_append(struct anv_reloc_list *list, void * anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords) { - if (batch->next + num_dwords * 4 > batch->end) - batch->extend_cb(batch, batch->user_data); + if (batch->next + num_dwords * 4 > batch->end) { + VkResult result = batch->extend_cb(batch, batch->user_data); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return NULL; + } + } void *p = batch->next; @@ -209,8 +276,16 @@ uint64_t anv_batch_emit_reloc(struct anv_batch *batch, void *location, struct anv_bo *bo, uint32_t delta) { - return anv_reloc_list_add(batch->relocs, batch->alloc, - location - batch->start, bo, delta); + uint64_t address_u64 = 0; + VkResult result = anv_reloc_list_add(batch->relocs, batch->alloc, + location - batch->start, bo, delta, + &address_u64); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return 0; + } + + return address_u64; } void @@ -221,8 +296,13 @@ anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other) size = other->next - other->start; assert(size % 4 == 0); - if (batch->next + size > batch->end) - batch->extend_cb(batch, batch->user_data); + if (batch->next + size > batch->end) { + VkResult result = batch->extend_cb(batch, batch->user_data); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return; + } + } 
assert(batch->next + size <= batch->end); @@ -230,8 +310,12 @@ anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other) memcpy(batch->next, other->start, size); offset = batch->next - batch->start; - anv_reloc_list_append(batch->relocs, batch->alloc, - other->relocs, offset); + VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc, + other->relocs, offset); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return; + } batch->next += size; } @@ -251,8 +335,8 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, if (bbo == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo, - ANV_CMD_BUFFER_BATCH_SIZE); + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + ANV_CMD_BUFFER_BATCH_SIZE, &bbo->bo); if (result != VK_SUCCESS) goto fail_alloc; @@ -265,7 +349,7 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, return VK_SUCCESS; fail_bo_alloc: - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); fail_alloc: vk_free(&cmd_buffer->pool->alloc, bbo); @@ -284,8 +368,8 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, if (bbo == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo, - other_bbo->bo.size); + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + other_bbo->bo->size, &bbo->bo); if (result != VK_SUCCESS) goto fail_alloc; @@ -295,14 +379,13 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, goto fail_bo_alloc; bbo->length = other_bbo->length; - memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length); - + memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length); *bbo_out = bbo; return VK_SUCCESS; fail_bo_alloc: - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); fail_alloc: vk_free(&cmd_buffer->pool->alloc, bbo); @@ -313,26 +396,26 @@ static void anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, size_t batch_padding) { - batch->next = batch->start = bbo->bo.map; - batch->end = bbo->bo.map + bbo->bo.size - batch_padding; + batch->next = batch->start = bbo->bo->map; + batch->end = bbo->bo->map + bbo->bo->size - batch_padding; batch->relocs = &bbo->relocs; - bbo->relocs.num_relocs = 0; + anv_reloc_list_clear(&bbo->relocs); } static void anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch, size_t batch_padding) { - batch->start = bbo->bo.map; - batch->next = bbo->bo.map + bbo->length; - batch->end = bbo->bo.map + bbo->bo.size - batch_padding; + batch->start = bbo->bo->map; + batch->next = bbo->bo->map + bbo->length; + batch->end = bbo->bo->map + bbo->bo->size - batch_padding; batch->relocs = &bbo->relocs; } static void anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch) { - assert(batch->start == bbo->bo.map); + assert(batch->start == bbo->bo->map); bbo->length = batch->next - batch->start; VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length)); } @@ -342,25 +425,25 @@ anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo, struct anv_batch *batch, size_t aditional, size_t batch_padding) { - assert(batch->start == bbo->bo.map); + assert(batch->start == bbo->bo->map); bbo->length = batch->next - batch->start; - size_t new_size = bbo->bo.size; + size_t new_size = bbo->bo->size; while (new_size <= bbo->length + 
aditional + batch_padding) new_size *= 2; - if (new_size == bbo->bo.size) + if (new_size == bbo->bo->size) return VK_SUCCESS; - struct anv_bo new_bo; + struct anv_bo *new_bo; VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, - &new_bo, new_size); + new_size, &new_bo); if (result != VK_SUCCESS) return result; - memcpy(new_bo.map, bbo->bo.map, bbo->length); + memcpy(new_bo->map, bbo->bo->map, bbo->length); - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); bbo->bo = new_bo; anv_batch_bo_continue(bbo, batch, batch_padding); @@ -368,12 +451,45 @@ anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo, return VK_SUCCESS; } +static void +anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer, + struct anv_batch_bo *prev_bbo, + struct anv_batch_bo *next_bbo, + uint32_t next_bbo_offset) +{ + const uint32_t bb_start_offset = + prev_bbo->length - GEN8_MI_BATCH_BUFFER_START_length * 4; + ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset; + + /* Make sure we're looking at a MI_BATCH_BUFFER_START */ + assert(((*bb_start >> 29) & 0x07) == 0); + assert(((*bb_start >> 23) & 0x3f) == 49); + + if (cmd_buffer->device->instance->physicalDevice.use_softpin) { + assert(prev_bbo->bo->flags & EXEC_OBJECT_PINNED); + assert(next_bbo->bo->flags & EXEC_OBJECT_PINNED); + + write_reloc(cmd_buffer->device, + prev_bbo->bo->map + bb_start_offset + 4, + next_bbo->bo->offset + next_bbo_offset, true); + } else { + uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1; + assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4); + + prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo; + prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset; + + /* Use a bogus presumed offset to force a relocation */ + prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1; + } +} + static void anv_batch_bo_destroy(struct anv_batch_bo *bbo, struct anv_cmd_buffer *cmd_buffer) { anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc); - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); vk_free(&cmd_buffer->pool->alloc, bbo); } @@ -394,23 +510,17 @@ anv_batch_bo_list_clone(const struct list_head *list, break; list_addtail(&new_bbo->link, new_list); - if (prev_bbo) { - /* As we clone this list of batch_bo's, they chain one to the - * other using MI_BATCH_BUFFER_START commands. We need to fix up - * those relocations as we go. Fortunately, this is pretty easy - * as it will always be the last relocation in the list. 
- */ - uint32_t last_idx = prev_bbo->relocs.num_relocs - 1; - assert(prev_bbo->relocs.reloc_bos[last_idx] == &bbo->bo); - prev_bbo->relocs.reloc_bos[last_idx] = &new_bbo->bo; - } + if (prev_bbo) + anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0); prev_bbo = new_bbo; } if (result != VK_SUCCESS) { - list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) + list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) { + list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); + } } return result; @@ -420,7 +530,7 @@ anv_batch_bo_list_clone(const struct list_head *list, * Functions related to anv_batch_bo *-----------------------------------------------------------------------*/ -static inline struct anv_batch_bo * +static struct anv_batch_bo * anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer) { return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link); @@ -429,9 +539,10 @@ anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer) struct anv_address anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer) { + struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); return (struct anv_address) { - .bo = &cmd_buffer->device->surface_state_block_pool.bo, - .offset = *(int32_t *)u_vector_head(&cmd_buffer->bt_blocks), + .bo = anv_binding_table_pool(cmd_buffer->device)->block_pool.bo, + .offset = bt_block->offset, }; } @@ -447,6 +558,9 @@ emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer, * gens. */ +#define GEN7_MI_BATCH_BUFFER_START_length 2 +#define GEN7_MI_BATCH_BUFFER_START_length_bias 2 + const uint32_t gen7_length = GEN7_MI_BATCH_BUFFER_START_length - GEN7_MI_BATCH_BUFFER_START_length_bias; const uint32_t gen8_length = @@ -455,7 +569,7 @@ emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer, anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START, bbs) { bbs.DWordLength = cmd_buffer->device->info.gen < 8 ? gen7_length : gen8_length; - bbs._2ndLevelBatchBuffer = _1stlevelbatch; + bbs.SecondLevelBatchBuffer = Firstlevelbatch; bbs.AddressSpaceIndicator = ASI_PPGTT; bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset }; } @@ -474,9 +588,9 @@ cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer, * chaining command, let's set it back where it should go. 
*/ batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(batch->end == current_bbo->bo.map + current_bbo->bo.size); + assert(batch->end == current_bbo->bo->map + current_bbo->bo->size); - emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0); + emit_batch_buffer_start(cmd_buffer, bbo->bo, 0); anv_batch_bo_finish(current_bbo, batch); } @@ -594,23 +708,28 @@ struct anv_state anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, uint32_t entries, uint32_t *state_offset) { - struct anv_block_pool *block_pool = - &cmd_buffer->device->surface_state_block_pool; - int32_t *bt_block = u_vector_head(&cmd_buffer->bt_blocks); - struct anv_state state; + struct anv_device *device = cmd_buffer->device; + struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); - state.alloc_size = align_u32(entries * 4, 32); + uint32_t bt_size = align_u32(entries * 4, 32); - if (cmd_buffer->bt_next + state.alloc_size > block_pool->block_size) + struct anv_state state = cmd_buffer->bt_next; + if (bt_size > state.alloc_size) return (struct anv_state) { 0 }; - state.offset = cmd_buffer->bt_next; - state.map = block_pool->map + *bt_block + state.offset; - - cmd_buffer->bt_next += state.alloc_size; + state.alloc_size = bt_size; + cmd_buffer->bt_next.offset += bt_size; + cmd_buffer->bt_next.map += bt_size; + cmd_buffer->bt_next.alloc_size -= bt_size; - assert(*bt_block < 0); - *state_offset = -(*bt_block); + if (device->instance->physicalDevice.use_softpin) { + assert(bt_block->offset >= 0); + *state_offset = device->surface_state_pool.block_pool.start_address - + device->binding_table_pool.block_pool.start_address - bt_block->offset; + } else { + assert(bt_block->offset < 0); + *state_offset = -bt_block->offset; + } return state; } @@ -618,7 +737,9 @@ anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, struct anv_state anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer) { - return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + isl_dev->ss.size, isl_dev->ss.align); } struct anv_state @@ -632,31 +753,21 @@ anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer, VkResult anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer) { - struct anv_block_pool *block_pool = - &cmd_buffer->device->surface_state_block_pool; - - int32_t *offset = u_vector_add(&cmd_buffer->bt_blocks); - if (offset == NULL) + struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states); + if (bt_block == NULL) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } - *offset = anv_block_pool_alloc_back(block_pool); - cmd_buffer->bt_next = 0; - - return VK_SUCCESS; -} + *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device); -static void -anv_execbuf_init(struct anv_execbuf *exec) -{ - memset(exec, 0, sizeof(*exec)); -} + /* The bt_next state is a rolling state (we update it as we suballocate + * from it) which is relative to the start of the binding table block. 
+ */ + cmd_buffer->bt_next = *bt_block; + cmd_buffer->bt_next.offset = 0; -static void -anv_execbuf_finish(struct anv_execbuf *exec, - const VkAllocationCallbacks *alloc) -{ - vk_free(alloc, exec->objects); - vk_free(alloc, exec->bos); + return VK_SUCCESS; } VkResult @@ -693,8 +804,10 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo; - success = u_vector_init(&cmd_buffer->bt_blocks, sizeof(int32_t), - 8 * sizeof(int32_t)); + /* u_vector requires power-of-two size elements */ + unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state)); + success = u_vector_init(&cmd_buffer->bt_block_states, + pow2_state_size, 8 * pow2_state_size); if (!success) goto fail_seen_bbos; @@ -704,14 +817,14 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) goto fail_bt_blocks; cmd_buffer->last_ss_pool_center = 0; - anv_cmd_buffer_new_binding_table_block(cmd_buffer); - - anv_execbuf_init(&cmd_buffer->execbuf2); + result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + goto fail_bt_blocks; return VK_SUCCESS; fail_bt_blocks: - u_vector_finish(&cmd_buffer->bt_blocks); + u_vector_finish(&cmd_buffer->bt_block_states); fail_seen_bbos: u_vector_finish(&cmd_buffer->seen_bbos); fail_batch_bo: @@ -723,12 +836,10 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) void anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) { - int32_t *bt_block; - u_vector_foreach(bt_block, &cmd_buffer->bt_blocks) { - anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool, - *bt_block); - } - u_vector_finish(&cmd_buffer->bt_blocks); + struct anv_state *bt_block; + u_vector_foreach(bt_block, &cmd_buffer->bt_block_states) + anv_binding_table_pool_free(cmd_buffer->device, *bt_block); + u_vector_finish(&cmd_buffer->bt_block_states); anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc); @@ -737,37 +848,35 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) /* Destroy all of the batch buffers */ list_for_each_entry_safe(struct anv_batch_bo, bbo, &cmd_buffer->batch_bos, link) { + list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); } - - anv_execbuf_finish(&cmd_buffer->execbuf2, &cmd_buffer->pool->alloc); } void anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) { /* Delete all but the first batch bo */ - assert(!list_empty(&cmd_buffer->batch_bos)); + assert(!list_is_empty(&cmd_buffer->batch_bos)); while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) { struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); } - assert(!list_empty(&cmd_buffer->batch_bos)); + assert(!list_is_empty(&cmd_buffer->batch_bos)); anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer), &cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START_length * 4); - while (u_vector_length(&cmd_buffer->bt_blocks) > 1) { - int32_t *bt_block = u_vector_remove(&cmd_buffer->bt_blocks); - anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool, - *bt_block); + while (u_vector_length(&cmd_buffer->bt_block_states) > 1) { + struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states); + anv_binding_table_pool_free(cmd_buffer->device, *bt_block); } - assert(u_vector_length(&cmd_buffer->bt_blocks) == 1); - cmd_buffer->bt_next = 0; + assert(u_vector_length(&cmd_buffer->bt_block_states) == 1); + 
cmd_buffer->bt_next = ANV_STATE_NULL; - cmd_buffer->surface_relocs.num_relocs = 0; + anv_reloc_list_clear(&cmd_buffer->surface_relocs); cmd_buffer->last_ss_pool_center = 0; /* Reset the list of seen buffers */ @@ -791,29 +900,27 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) * with our BATCH_BUFFER_END in another BO. */ cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); - anv_batch_emit(&cmd_buffer->batch, GEN7_MI_BATCH_BUFFER_END, bbe); + anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_END, bbe); /* Round batch up to an even number of dwords. */ if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4) - anv_batch_emit(&cmd_buffer->batch, GEN7_MI_NOOP, noop); + anv_batch_emit(&cmd_buffer->batch, GEN8_MI_NOOP, noop); cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY; - } - - anv_batch_bo_finish(batch_bo, &cmd_buffer->batch); - - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + } else { + assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); /* If this is a secondary command buffer, we need to determine the * mode in which it will be executed with vkExecuteCommands. We * determine this statically here so that this stays in sync with the * actual ExecuteCommands implementation. */ + const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start; if (!cmd_buffer->device->can_chain_batches) { cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT; } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) && - (batch_bo->length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) { + (length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) { /* If the secondary has exactly one batch buffer in its list *and* * that batch buffer is less than half of the maximum size, we're * probably better of simply copying it into our batch. @@ -823,20 +930,31 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) { cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN; - /* When we chain, we need to add an MI_BATCH_BUFFER_START command - * with its relocation. In order to handle this we'll increment here - * so we can unconditionally decrement right before adding the - * MI_BATCH_BUFFER_START command. + /* In order to chain, we need this command buffer to contain an + * MI_BATCH_BUFFER_START which will jump back to the calling batch. + * It doesn't matter where it points now so long as has a valid + * relocation. We'll adjust it later as part of the chaining + * process. + * + * We set the end of the batch a little short so we would be sure we + * have room for the chaining command. Since we're about to emit the + * chaining command, let's set it back where it should go. 
*/ - batch_bo->relocs.num_relocs++; - cmd_buffer->batch.next += GEN8_MI_BATCH_BUFFER_START_length * 4; + cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4; + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); + + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + assert(cmd_buffer->batch.start == batch_bo->bo->map); } else { cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN; } } + + anv_batch_bo_finish(batch_bo, &cmd_buffer->batch); } -static inline VkResult +static VkResult anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer, struct list_head *list) { @@ -873,38 +991,18 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, struct anv_batch_bo *last_bbo = list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link); - emit_batch_buffer_start(primary, &first_bbo->bo, 0); + emit_batch_buffer_start(primary, first_bbo->bo, 0); struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary); - assert(primary->batch.start == this_bbo->bo.map); + assert(primary->batch.start == this_bbo->bo->map); uint32_t offset = primary->batch.next - primary->batch.start; - const uint32_t inst_size = GEN8_MI_BATCH_BUFFER_START_length * 4; - /* Roll back the previous MI_BATCH_BUFFER_START and its relocation so we - * can emit a new command and relocation for the current splice. In - * order to handle the initial-use case, we incremented next and - * num_relocs in end_batch_buffer() so we can alyways just subtract - * here. + /* Make the tail of the secondary point back to right after the + * MI_BATCH_BUFFER_START in the primary batch. */ - last_bbo->relocs.num_relocs--; - secondary->batch.next -= inst_size; - emit_batch_buffer_start(secondary, &this_bbo->bo, offset); - anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); + anv_batch_bo_link(primary, last_bbo, this_bbo, offset); - /* After patching up the secondary buffer, we need to clflush the - * modified instruction in case we're on a !llc platform. We use a - * little loop to handle the case where the instruction crosses a cache - * line boundary. 
- */ - if (!primary->device->info.has_llc) { - void *inst = secondary->batch.next - inst_size; - void *p = (void *) (((uintptr_t) inst) & ~CACHELINE_MASK); - __builtin_ia32_mfence(); - while (p < secondary->batch.next) { - __builtin_ia32_clflush(p); - p += CACHELINE_SIZE; - } - } + anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); break; } case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: { @@ -938,14 +1036,53 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, &secondary->surface_relocs, 0); } +struct anv_execbuf { + struct drm_i915_gem_execbuffer2 execbuf; + + struct drm_i915_gem_exec_object2 * objects; + uint32_t bo_count; + struct anv_bo ** bos; + + /* Allocated length of the 'objects' and 'bos' arrays */ + uint32_t array_length; + + bool has_relocs; + + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; +}; + +static void +anv_execbuf_init(struct anv_execbuf *exec) +{ + memset(exec, 0, sizeof(*exec)); +} + +static void +anv_execbuf_finish(struct anv_execbuf *exec) +{ + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); +} + static VkResult -anv_execbuf_add_bo(struct anv_execbuf *exec, +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags); + +static VkResult +anv_execbuf_add_bo(struct anv_device *device, + struct anv_execbuf *exec, struct anv_bo *bo, struct anv_reloc_list *relocs, - const VkAllocationCallbacks *alloc) + uint32_t extra_flags) { struct drm_i915_gem_exec_object2 *obj = NULL; + bo = anv_bo_unwrap(bo); + if (bo->index < exec->bo_count && exec->bos[bo->index] == bo) obj = &exec->objects[bo->index]; @@ -957,16 +1094,14 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; struct drm_i915_gem_exec_object2 *new_objects = - vk_alloc(alloc, new_len * sizeof(*new_objects), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope); if (new_objects == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); struct anv_bo **new_bos = - vk_alloc(alloc, new_len * sizeof(*new_bos), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope); if (new_bos == NULL) { - vk_free(alloc, new_objects); + vk_free(exec->alloc, new_objects); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } @@ -977,8 +1112,8 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, exec->bo_count * sizeof(*new_bos)); } - vk_free(alloc, exec->objects); - vk_free(alloc, exec->bos); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); exec->objects = new_objects; exec->bos = new_bos; @@ -996,70 +1131,82 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, obj->relocs_ptr = 0; obj->alignment = 0; obj->offset = bo->offset; - obj->flags = bo->is_winsys_bo ? EXEC_OBJECT_WRITE : 0; + obj->flags = bo->flags | extra_flags; obj->rsvd1 = 0; obj->rsvd2 = 0; } - if (relocs != NULL && obj->relocation_count == 0) { - /* This is the first time we've ever seen a list of relocations for - * this BO. Go ahead and set the relocations and then walk the list - * of relocations and add them all. 
- */ - obj->relocation_count = relocs->num_relocs; - obj->relocs_ptr = (uintptr_t) relocs->relocs; + if (relocs != NULL) { + assert(obj->relocation_count == 0); - for (size_t i = 0; i < relocs->num_relocs; i++) { - /* A quick sanity check on relocations */ - assert(relocs->relocs[i].offset < bo->size); - anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, alloc); + if (relocs->num_relocs > 0) { + /* This is the first time we've ever seen a list of relocations for + * this BO. Go ahead and set the relocations and then walk the list + * of relocations and add them all. + */ + exec->has_relocs = true; + obj->relocation_count = relocs->num_relocs; + obj->relocs_ptr = (uintptr_t) relocs->relocs; + + for (size_t i = 0; i < relocs->num_relocs; i++) { + VkResult result; + + /* A quick sanity check on relocations */ + assert(relocs->relocs[i].offset < bo->size); + result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i], + NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } } + + return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, + relocs->deps, extra_flags); } return VK_SUCCESS; } -static void -anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, - struct anv_reloc_list *list) +/* Add BO dependencies to execbuf */ +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags) { - for (size_t i = 0; i < list->num_relocs; i++) - list->relocs[i].target_handle = list->reloc_bos[i]->index; + for (uint32_t w = 0; w < dep_words; w++) { + BITSET_WORD mask = deps[w]; + while (mask) { + int i = u_bit_scan(&mask); + uint32_t gem_handle = w * BITSET_WORDBITS + i; + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + assert(bo->refcount > 0); + VkResult result = + anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; } static void -write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush) +anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, + struct anv_reloc_list *list) { - unsigned reloc_size = 0; - if (device->info.gen >= 8) { - /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress: - * - * "This field specifies the address of the memory location where the - * register value specified in the DWord above will read from. The - * address specifies the DWord location of the data. Range = - * GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress - * [63:48] are ignored by the HW and assumed to be in correct - * canonical form [63:48] == [47]." 
- */ - const int shift = 63 - 47; - reloc_size = sizeof(uint64_t); - *(uint64_t *)p = (((int64_t)v) << shift) >> shift; - } else { - reloc_size = sizeof(uint32_t); - *(uint32_t *)p = v; - } - - if (flush && !device->info.has_llc) - anv_clflush_range(p, reloc_size); + for (size_t i = 0; i < list->num_relocs; i++) + list->relocs[i].target_handle = anv_bo_unwrap(list->reloc_bos[i])->index; } static void -adjust_relocations_from_state_pool(struct anv_block_pool *pool, +adjust_relocations_from_state_pool(struct anv_state_pool *pool, struct anv_reloc_list *relocs, uint32_t last_pool_center_bo_offset) { - assert(last_pool_center_bo_offset <= pool->center_bo_offset); - uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset; + assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); + uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; for (size_t i = 0; i < relocs->num_relocs; i++) { /* All of the relocations from this block pool to other BO's should @@ -1072,13 +1219,14 @@ adjust_relocations_from_state_pool(struct anv_block_pool *pool, } static void -adjust_relocations_to_state_pool(struct anv_block_pool *pool, +adjust_relocations_to_state_pool(struct anv_state_pool *pool, struct anv_bo *from_bo, struct anv_reloc_list *relocs, uint32_t last_pool_center_bo_offset) { - assert(last_pool_center_bo_offset <= pool->center_bo_offset); - uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset; + assert(!from_bo->is_wrapper); + assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); + uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; /* When we initially emit relocations into a block pool, we don't * actually know what the final center_bo_offset will be so we just emit @@ -1087,7 +1235,7 @@ adjust_relocations_to_state_pool(struct anv_block_pool *pool, * relocations that point to the pool bo with the correct offset. */ for (size_t i = 0; i < relocs->num_relocs; i++) { - if (relocs->reloc_bos[i] == &pool->bo) { + if (relocs->reloc_bos[i] == pool->block_pool.bo) { /* Adjust the delta value in the relocation to correctly * correspond to the new delta. Initially, this value may have * been negative (if treated as unsigned), but we trust in @@ -1101,46 +1249,212 @@ adjust_relocations_to_state_pool(struct anv_block_pool *pool, * use by the GPU at the moment. 
*/ assert(relocs->relocs[i].offset < from_bo->size); - write_reloc(pool->device, from_bo->map + relocs->relocs[i].offset, + write_reloc(pool->block_pool.device, + from_bo->map + relocs->relocs[i].offset, relocs->relocs[i].presumed_offset + relocs->relocs[i].delta, false); } } } -void -anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer) +static void +anv_reloc_list_apply(struct anv_device *device, + struct anv_reloc_list *list, + struct anv_bo *bo, + bool always_relocate) { - struct anv_batch *batch = &cmd_buffer->batch; - struct anv_block_pool *ss_pool = - &cmd_buffer->device->surface_state_block_pool; + bo = anv_bo_unwrap(bo); + + for (size_t i = 0; i < list->num_relocs; i++) { + struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]); + if (list->relocs[i].presumed_offset == target_bo->offset && + !always_relocate) + continue; + + void *p = bo->map + list->relocs[i].offset; + write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true); + list->relocs[i].presumed_offset = target_bo->offset; + } +} + +/** + * This function applies the relocation for a command buffer and writes the + * actual addresses into the buffers as per what we were told by the kernel on + * the previous execbuf2 call. This should be safe to do because, for each + * relocated address, we have two cases: + * + * 1) The target BO is inactive (as seen by the kernel). In this case, it is + * not in use by the GPU so updating the address is 100% ok. It won't be + * in-use by the GPU (from our context) again until the next execbuf2 + * happens. If the kernel decides to move it in the next execbuf2, it + * will have to do the relocations itself, but that's ok because it should + * have all of the information needed to do so. + * + * 2) The target BO is active (as seen by the kernel). In this case, it + * hasn't moved since the last execbuffer2 call because GTT shuffling + * *only* happens when the BO is idle. (From our perspective, it only + * happens inside the execbuffer2 ioctl, but the shuffling may be + * triggered by another ioctl, with full-ppgtt this is limited to only + * execbuffer2 ioctls on the same context, or memory pressure.) Since the + * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT + * address and the relocated value we are writing into the BO will be the + * same as the value that is already there. + * + * There is also a possibility that the target BO is active but the exact + * RENDER_SURFACE_STATE object we are writing the relocation into isn't in + * use. In this case, the address currently in the RENDER_SURFACE_STATE + * may be stale but it's still safe to write the relocation because that + * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and + * won't be until the next execbuf2 call. + * + * By doing relocations on the CPU, we can tell the kernel that it doesn't + * need to bother. We want to do this because the surface state buffer is + * used by every command buffer so, if the kernel does the relocations, it + * will always be busy and the kernel will always stall. This is also + * probably the fastest mechanism for doing relocations since the kernel would + * have to make a full copy of all the relocations lists. 
+ */ +static bool +relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_execbuf *exec) +{ + if (!exec->has_relocs) + return true; + + static int userspace_relocs = -1; + if (userspace_relocs < 0) + userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true); + if (!userspace_relocs) + return false; + + /* First, we have to check to see whether or not we can even do the + * relocation. New buffers which have never been submitted to the kernel + * don't have a valid offset so we need to let the kernel do relocations so + * that we can get offsets for them. On future execbuf2 calls, those + * buffers will have offsets and we will be able to skip relocating. + * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. + */ + for (uint32_t i = 0; i < exec->bo_count; i++) { + assert(!exec->bos[i]->is_wrapper); + if (exec->bos[i]->offset == (uint64_t)-1) + return false; + } + + /* Since surface states are shared between command buffers and we don't + * know what order they will be submitted to the kernel, we don't know + * what address is actually written in the surface state object at any + * given time. The only option is to always relocate them. + */ + struct anv_bo *surface_state_bo = + anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo); + anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, + surface_state_bo, + true /* always relocate surface states */); + + /* Since we own all of the batch buffers, we know what values are stored + * in the relocated addresses and only have to update them if the offsets + * have changed. + */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + anv_reloc_list_apply(cmd_buffer->device, + &(*bbo)->relocs, (*bbo)->bo, false); + } - cmd_buffer->execbuf2.bo_count = 0; + for (uint32_t i = 0; i < exec->bo_count; i++) + exec->objects[i].offset = exec->bos[i]->offset; + + return true; +} + +static VkResult +setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, + struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_state_pool *ss_pool = + &cmd_buffer->device->surface_state_pool; adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, cmd_buffer->last_ss_pool_center); + VkResult result; + if (cmd_buffer->device->instance->physicalDevice.use_softpin) { + anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + /* Add surface dependencies (BOs) to the execbuf */ + anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, + cmd_buffer->surface_relocs.dep_words, + cmd_buffer->surface_relocs.deps, 0); + + /* Add the BOs for all memory objects */ + list_for_each_entry(struct anv_device_memory, mem, + &cmd_buffer->device->memory_objects, link) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + mem->bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + struct anv_block_pool *pool; + pool = &cmd_buffer->device->dynamic_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } - anv_execbuf_add_bo(&cmd_buffer->execbuf2, &ss_pool->bo, - &cmd_buffer->surface_relocs, - &cmd_buffer->pool->alloc); + pool = &cmd_buffer->device->instruction_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, 
execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &cmd_buffer->device->binding_table_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + } else { + /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs + * will get added automatically by processing relocations on the batch + * buffer. We have to add the surface state BO manually because it has + * relocations of its own that we need to be sure are processsed. + */ + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + ss_pool->block_pool.bo, + &cmd_buffer->surface_relocs, 0); + if (result != VK_SUCCESS) + return result; + } /* First, we walk over all of the bos we've seen and add them and their * relocations to the validate list. */ struct anv_batch_bo **bbo; u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { - adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs, + adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs, cmd_buffer->last_ss_pool_center); - anv_execbuf_add_bo(&cmd_buffer->execbuf2, &(*bbo)->bo, &(*bbo)->relocs, - &cmd_buffer->pool->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + (*bbo)->bo, &(*bbo)->relocs, 0); + if (result != VK_SUCCESS) + return result; } /* Now that we've adjusted all of the surface state relocations, we need to * record the surface state pool center so future executions of the command * buffer can adjust correctly. */ - cmd_buffer->last_ss_pool_center = ss_pool->center_bo_offset; + cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset; struct anv_batch_bo *first_batch_bo = list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link); @@ -1150,69 +1464,249 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer) * corresponding to the first batch_bo in the chain with the last * element in the list. */ - if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) { - uint32_t idx = first_batch_bo->bo.index; - uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1; + if (first_batch_bo->bo->index != execbuf->bo_count - 1) { + uint32_t idx = first_batch_bo->bo->index; + uint32_t last_idx = execbuf->bo_count - 1; - struct drm_i915_gem_exec_object2 tmp_obj = - cmd_buffer->execbuf2.objects[idx]; - assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo); + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == first_batch_bo->bo); - cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx]; - cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx]; - cmd_buffer->execbuf2.bos[idx]->index = idx; + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->index = idx; - cmd_buffer->execbuf2.objects[last_idx] = tmp_obj; - cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo; - first_batch_bo->bo.index = last_idx; + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = first_batch_bo->bo; + first_batch_bo->bo->index = last_idx; } + /* If we are pinning our BOs, we shouldn't have to relocate anything */ + if (cmd_buffer->device->instance->physicalDevice.use_softpin) + assert(!execbuf->has_relocs); + /* Now we go through and fixup all of the relocation lists to point to * the correct indices in the object array. 
We have to do this after we * reorder the list above as some of the indices may have changed. */ - u_vector_foreach(bbo, &cmd_buffer->seen_bbos) - anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs); + if (execbuf->has_relocs) { + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) + anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs); - anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs); + anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs); + } if (!cmd_buffer->device->info.has_llc) { __builtin_ia32_mfence(); u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE) - __builtin_ia32_clflush((*bbo)->bo.map + i); + __builtin_ia32_clflush((*bbo)->bo->map + i); } } - cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) { - .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects, - .buffer_count = cmd_buffer->execbuf2.bo_count, + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, .batch_start_offset = 0, .batch_len = batch->next - batch->start, .cliprects_ptr = 0, .num_cliprects = 0, .DR1 = 0, .DR4 = 0, - .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER | - I915_EXEC_CONSTANTS_REL_GENERAL, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, .rsvd1 = cmd_buffer->device->context_id, .rsvd2 = 0, }; + + if (relocate_cmd_buffer(cmd_buffer, execbuf)) { + /* If we were able to successfully relocate everything, tell the kernel + * that it can skip doing relocations. The requirement for using + * NO_RELOC is: + * + * 1) The addresses written in the objects must match the corresponding + * reloc.presumed_offset which in turn must match the corresponding + * execobject.offset. + * + * 2) To avoid stalling, execobject.offset should match the current + * address of that object within the active context. + * + * In order to satisfy all of the invariants that make userspace + * relocations to be safe (see relocate_cmd_buffer()), we need to + * further ensure that the addresses we use match those used by the + * kernel for the most recent execbuf2. + * + * The kernel may still choose to do relocations anyway if something has + * moved in the GTT. In this case, the relocation list still needs to be + * valid. All relocations on the batch buffers are already valid and + * kept up-to-date. For surface state relocations, by applying the + * relocations in relocate_cmd_buffer, we ensured that the address in + * the RENDER_SURFACE_STATE matches presumed_offset, so it should be + * safe for the kernel to relocate them as needed. + */ + execbuf->execbuf.flags |= I915_EXEC_NO_RELOC; + } else { + /* In the case where we fall back to doing kernel relocations, we need + * to ensure that the relocation list is valid. All relocations on the + * batch buffers are already valid and kept up-to-date. Since surface + * states are shared between command buffers and we don't know what + * order they will be submitted to the kernel, we don't know what + * address is actually written in the surface state object at any given + * time. The only option is to set a bogus presumed offset and let the + * kernel relocate them. 
+ */ + for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++) + cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1; + } + + return VK_SUCCESS; +} + +static VkResult +setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device) +{ + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->trivial_batch_bo, + NULL, 0); + if (result != VK_SUCCESS) + return result; + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = 8, /* GEN7_MI_BATCH_BUFFER_END and NOOP */ + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; } +/* We lock around execbuf for three main reasons: + * + * 1) When a block pool is resized, we create a new gem handle with a + * different size and, in the case of surface states, possibly a different + * center offset but we re-use the same anv_bo struct when we do so. If + * this happens in the middle of setting up an execbuf, we could end up + * with our list of BOs out of sync with our list of gem handles. + * + * 2) The algorithm we use for building the list of unique buffers isn't + * thread-safe. While the client is supposed to syncronize around + * QueueSubmit, this would be extremely difficult to debug if it ever came + * up in the wild due to a broken app. It's better to play it safe and + * just lock around QueueSubmit. + * + * 3) The anv_cmd_buffer_execbuf function may perform relocations in + * userspace. Due to the fact that the surface state buffer is shared + * between batches, we can't afford to have that happen from multiple + * threads at the same time. Even though the user is supposed to ensure + * this doesn't happen, we play it safe as in (2) above. + * + * Since the only other things that ever take the device lock such as block + * pool resize only rarely happen, this will almost never be contended so + * taking a lock isn't really an expensive operation in this case. + */ VkResult -anv_cmd_buffer_execbuf(struct anv_device *device, - struct anv_cmd_buffer *cmd_buffer) +anv_queue_execbuf_locked(struct anv_queue *queue, + struct anv_queue_submit *submit) { - /* Since surface states are shared between command buffers and we don't - * know what order they will be submitted to the kernel, we don't know what - * address is actually written in the surface state object at any given - * time. The only option is to set a bogus presumed offset and let - * relocations do their job. - */ - for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++) - cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1; + struct anv_device *device = queue->device; + struct anv_execbuf execbuf; + anv_execbuf_init(&execbuf); + execbuf.alloc = submit->alloc; + execbuf.alloc_scope = submit->alloc_scope; - return anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf, - cmd_buffer->execbuf2.bos); + VkResult result; + + for (uint32_t i = 0; i < submit->fence_bo_count; i++) { + int signaled; + struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled); + + result = anv_execbuf_add_bo(device, &execbuf, bo, NULL, + signaled ? 
EXEC_OBJECT_WRITE : 0); + if (result != VK_SUCCESS) + goto error; + } + + if (submit->cmd_buffer) { + result = setup_execbuf_for_cmd_buffer(&execbuf, submit->cmd_buffer); + } else if (submit->simple_bo) { + result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; + + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = submit->simple_bo_size, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + } else { + result = setup_empty_execbuf(&execbuf, queue->device); + } + + if (result != VK_SUCCESS) + goto error; + + if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) { + if (submit->cmd_buffer) { + struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos); + + device->cmd_buffer_being_decoded = submit->cmd_buffer; + gen_print_batch(&device->decoder_ctx, (*bo)->bo->map, + (*bo)->bo->size, (*bo)->bo->offset, false); + device->cmd_buffer_being_decoded = NULL; + } else if (submit->simple_bo) { + gen_print_batch(&device->decoder_ctx, submit->simple_bo->map, + submit->simple_bo->size, submit->simple_bo->offset, false); + } else { + gen_print_batch(&device->decoder_ctx, + device->trivial_batch_bo->map, + device->trivial_batch_bo->size, + device->trivial_batch_bo->offset, false); + } + } + + if (submit->fence_count > 0) { + assert(device->instance->physicalDevice.has_syncobj); + execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.execbuf.num_cliprects = submit->fence_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences; + } + + if (submit->in_fence != -1) { + execbuf.execbuf.flags |= I915_EXEC_FENCE_IN; + execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence; + } + + if (submit->need_out_fence) + execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT; + + int ret = queue->device->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) { + result = anv_queue_set_lost(queue, + "execbuf2 failed: %s", + strerror(ret)); + } + + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; + } + + if (result == VK_SUCCESS && submit->need_out_fence) + submit->out_fence = execbuf.execbuf.rsvd2 >> 32; + + error: + pthread_cond_broadcast(&device->queue_submit); + + anv_execbuf_finish(&execbuf); + + return result; }
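
Note on the dependency-tracking scheme introduced by this patch: for soft-pinned
target BOs (EXEC_OBJECT_PINNED), anv_reloc_list_add() no longer emits a
drm_i915_gem_relocation_entry at all.  It records the target's GEM handle in a
bitset (list->deps, sized in BITSET_WORDs), anv_reloc_list_append() merges two
dependency sets with a word-wise OR, and anv_execbuf_add_bo_bitset() later walks
the set bits to pull every referenced BO into the execbuf object list.  The
snippet below is a minimal, standalone sketch of that bitset pattern; it is not
Mesa code.  dep_set_add()/dep_set_for_each() are hypothetical stand-ins for the
BITSET_SET()/u_bit_scan() helpers used in the patch (util/bitset.h,
util/bitscan.h), and the bit-scan loop assumes a GCC/Clang __builtin_ctz.

/* Sketch: one bit per GEM handle, word-wise storage, ctz-based iteration. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define WORD_BITS 32u

struct dep_set {
   uint32_t *words;      /* bit N set => GEM handle N is a dependency */
   uint32_t  num_words;
};

/* Grow the bitset to cover 'handle', zeroing any new words, then set its bit
 * (mirrors anv_reloc_list_grow_deps() followed by BITSET_SET()). */
static void
dep_set_add(struct dep_set *set, uint32_t handle)
{
   uint32_t need = handle / WORD_BITS + 1;
   if (need > set->num_words) {
      uint32_t *words = realloc(set->words, need * sizeof(*words));
      if (words == NULL)
         abort();
      memset(words + set->num_words, 0,
             (need - set->num_words) * sizeof(*words));
      set->words = words;
      set->num_words = need;
   }
   set->words[handle / WORD_BITS] |= 1u << (handle % WORD_BITS);
}

/* Visit every set bit, recovering the GEM handle it encodes
 * (mirrors the u_bit_scan() loop in anv_execbuf_add_bo_bitset()). */
static void
dep_set_for_each(const struct dep_set *set, void (*cb)(uint32_t handle))
{
   for (uint32_t w = 0; w < set->num_words; w++) {
      uint32_t mask = set->words[w];
      while (mask) {
         uint32_t bit = (uint32_t)__builtin_ctz(mask);   /* lowest set bit */
         mask &= mask - 1;                               /* clear that bit */
         cb(w * WORD_BITS + bit);
      }
   }
}

static void
add_to_execbuf(uint32_t handle)
{
   printf("would add BO with GEM handle %u to the execbuf object list\n", handle);
}

int
main(void)
{
   struct dep_set deps = { NULL, 0 };
   dep_set_add(&deps, 3);    /* e.g. a pinned batch BO */
   dep_set_add(&deps, 70);   /* e.g. a pinned state-pool BO */
   dep_set_for_each(&deps, add_to_execbuf);
   free(deps.words);
   return 0;
}

Storing one bit per GEM handle keeps a reloc list's dependency footprint at a
few words, makes the merge in anv_reloc_list_append() a plain OR over
dep_words, and makes execbuf setup cost proportional to the number of distinct
BOs rather than to the number of references to them.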