X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2Fanv_batch_chain.c;h=a1fb8bf731ae3404a3a816ccafde02d36ec6bb8d;hb=f270a0973741724d5bdddd30e4b241caa12a1c29;hp=325da83324e659f095121c00079ce8d33bf06f07;hpb=db9f4b2a2bbf1aff3c6c878735495fc7accbb11e;p=mesa.git

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 325da83324e..a1fb8bf731a 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -29,9 +29,10 @@
 
 #include "anv_private.h"
 
-#include "genxml/gen7_pack.h"
 #include "genxml/gen8_pack.h"
 
+#include "util/debug.h"
+
 /** \file anv_batch_chain.c
  *
  * This file contains functions related to anv_cmd_buffer as a data
@@ -139,7 +140,7 @@ anv_reloc_list_grow(struct anv_reloc_list *list,
    return VK_SUCCESS;
 }
 
-uint64_t
+VkResult
 anv_reloc_list_add(struct anv_reloc_list *list,
                    const VkAllocationCallbacks *alloc,
                    uint32_t offset, struct anv_bo *target_bo, uint32_t delta)
@@ -147,11 +148,9 @@ anv_reloc_list_add(struct anv_reloc_list *list,
    struct drm_i915_gem_relocation_entry *entry;
    int index;
 
-   const uint32_t domain =
-      target_bo->is_winsys_bo ? I915_GEM_DOMAIN_RENDER : 0;
-
-   anv_reloc_list_grow(list, alloc, 1);
-   /* TODO: Handle failure */
+   VkResult result = anv_reloc_list_grow(list, alloc, 1);
+   if (result != VK_SUCCESS)
+      return result;
 
    /* XXX: Can we use I915_EXEC_HANDLE_LUT? */
    index = list->num_relocs++;
@@ -161,20 +160,21 @@ anv_reloc_list_add(struct anv_reloc_list *list,
    entry->delta = delta;
    entry->offset = offset;
    entry->presumed_offset = target_bo->offset;
-   entry->read_domains = domain;
-   entry->write_domain = domain;
+   entry->read_domains = 0;
+   entry->write_domain = 0;
    VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));
 
-   return target_bo->offset + delta;
+   return VK_SUCCESS;
 }
 
-static void
+static VkResult
 anv_reloc_list_append(struct anv_reloc_list *list,
                       const VkAllocationCallbacks *alloc,
                       struct anv_reloc_list *other, uint32_t offset)
 {
-   anv_reloc_list_grow(list, alloc, other->num_relocs);
-   /* TODO: Handle failure */
+   VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs);
+   if (result != VK_SUCCESS)
+      return result;
 
    memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
           other->num_relocs * sizeof(other->relocs[0]));
@@ -185,6 +185,7 @@ anv_reloc_list_append(struct anv_reloc_list *list,
       list->relocs[i + list->num_relocs].offset += offset;
 
    list->num_relocs += other->num_relocs;
+   return VK_SUCCESS;
 }
 
 /*-----------------------------------------------------------------------*
  * Functions related to anv_batch
  *-----------------------------------------------------------------------*/
 
 void *
 anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
 {
-   if (batch->next + num_dwords * 4 > batch->end)
-      batch->extend_cb(batch, batch->user_data);
+   if (batch->next + num_dwords * 4 > batch->end) {
+      VkResult result = batch->extend_cb(batch, batch->user_data);
+      if (result != VK_SUCCESS) {
+         anv_batch_set_error(batch, result);
+         return NULL;
+      }
+   }
 
    void *p = batch->next;
@@ -209,8 +215,14 @@ uint64_t
 anv_batch_emit_reloc(struct anv_batch *batch,
                      void *location, struct anv_bo *bo, uint32_t delta)
 {
-   return anv_reloc_list_add(batch->relocs, batch->alloc,
-                             location - batch->start, bo, delta);
+   VkResult result = anv_reloc_list_add(batch->relocs, batch->alloc,
+                                        location - batch->start, bo, delta);
+   if (result != VK_SUCCESS) {
+      anv_batch_set_error(batch, result);
+      return 0;
+   }
+
+   return bo->offset + delta;
 }
 
 void
@@ -221,8 +233,13 @@ anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
    size = other->next - other->start;
    assert(size % 4 == 0);
 
-   if (batch->next + size > batch->end)
-      batch->extend_cb(batch, batch->user_data);
+   if (batch->next + size > batch->end) {
+      VkResult result = batch->extend_cb(batch, batch->user_data);
+      if (result != VK_SUCCESS) {
+         anv_batch_set_error(batch, result);
+         return;
+      }
+   }
 
    assert(batch->next + size <= batch->end);
@@ -230,8 +247,12 @@ anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
    memcpy(batch->next, other->start, size);
 
    offset = batch->next - batch->start;
-   anv_reloc_list_append(batch->relocs, batch->alloc,
-                         other->relocs, offset);
+   VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc,
+                                           other->relocs, offset);
+   if (result != VK_SUCCESS) {
+      anv_batch_set_error(batch, result);
+      return;
+   }
 
    batch->next += size;
 }
@@ -297,8 +318,6 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
    bbo->length = other_bbo->length;
    memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length);
 
-   bbo->last_ss_pool_bo_offset = other_bbo->last_ss_pool_bo_offset;
-
    *bbo_out = bbo;
 
    return VK_SUCCESS;
@@ -318,7 +337,6 @@ anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
    batch->next = batch->start = bbo->bo.map;
    batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
    batch->relocs = &bbo->relocs;
-   bbo->last_ss_pool_bo_offset = 0;
    bbo->relocs.num_relocs = 0;
 }
@@ -423,7 +441,7 @@ anv_batch_bo_list_clone(const struct list_head *list,
  * Functions related to anv_batch_bo
  *-----------------------------------------------------------------------*/
 
-static inline struct anv_batch_bo *
+static struct anv_batch_bo *
 anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
 {
    return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
@@ -432,9 +450,10 @@ anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
 struct anv_address
 anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
 {
+   struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
    return (struct anv_address) {
-      .bo = &cmd_buffer->device->surface_state_block_pool.bo,
-      .offset = *(int32_t *)u_vector_head(&cmd_buffer->bt_blocks),
+      .bo = &cmd_buffer->device->surface_state_pool.block_pool.bo,
+      .offset = bt_block->offset,
    };
 }
@@ -450,6 +469,9 @@ emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
     * gens.
     */
 
+#define GEN7_MI_BATCH_BUFFER_START_length 2
+#define GEN7_MI_BATCH_BUFFER_START_length_bias 2
+
    const uint32_t gen7_length =
       GEN7_MI_BATCH_BUFFER_START_length - GEN7_MI_BATCH_BUFFER_START_length_bias;
    const uint32_t gen8_length =
@@ -597,23 +619,22 @@ struct anv_state
 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t entries, uint32_t *state_offset)
 {
-   struct anv_block_pool *block_pool =
-      &cmd_buffer->device->surface_state_block_pool;
-   int32_t *bt_block = u_vector_head(&cmd_buffer->bt_blocks);
+   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
+   struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
    struct anv_state state;
 
    state.alloc_size = align_u32(entries * 4, 32);
 
-   if (cmd_buffer->bt_next + state.alloc_size > block_pool->block_size)
+   if (cmd_buffer->bt_next + state.alloc_size > state_pool->block_size)
       return (struct anv_state) { 0 };
 
    state.offset = cmd_buffer->bt_next;
-   state.map = block_pool->map + *bt_block + state.offset;
+   state.map = state_pool->block_pool.map + bt_block->offset + state.offset;
 
    cmd_buffer->bt_next += state.alloc_size;
 
-   assert(*bt_block < 0);
-   *state_offset = -(*bt_block);
+   assert(bt_block->offset < 0);
+   *state_offset = -bt_block->offset;
 
    return state;
 }
@@ -621,7 +642,9 @@ anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
 struct anv_state
 anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
 {
-   return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
+   struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+   return anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
+                                 isl_dev->ss.size, isl_dev->ss.align);
 }
 
 struct anv_state
@@ -635,14 +658,15 @@ anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
 VkResult
 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
 {
-   struct anv_block_pool *block_pool =
-      &cmd_buffer->device->surface_state_block_pool;
+   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
 
-   int32_t *offset = u_vector_add(&cmd_buffer->bt_blocks);
-   if (offset == NULL)
+   struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
+   if (bt_block == NULL) {
+      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
 
-   *offset = anv_block_pool_alloc_back(block_pool);
+   *bt_block = anv_state_pool_alloc_back(state_pool);
    cmd_buffer->bt_next = 0;
 
    return VK_SUCCESS;
 }
@@ -682,8 +706,10 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 
    *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
 
-   success = u_vector_init(&cmd_buffer->bt_blocks, sizeof(int32_t),
-                           8 * sizeof(int32_t));
+   /* u_vector requires power-of-two size elements */
+   unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state));
+   success = u_vector_init(&cmd_buffer->bt_block_states,
+                           pow2_state_size, 8 * pow2_state_size);
    if (!success)
       goto fail_seen_bbos;
@@ -691,17 +717,16 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
                                  &cmd_buffer->pool->alloc);
    if (result != VK_SUCCESS)
       goto fail_bt_blocks;
+   cmd_buffer->last_ss_pool_center = 0;
 
-   anv_cmd_buffer_new_binding_table_block(cmd_buffer);
-
-   cmd_buffer->execbuf2.objects = NULL;
-   cmd_buffer->execbuf2.bos = NULL;
-   cmd_buffer->execbuf2.array_length = 0;
+   result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+   if (result != VK_SUCCESS)
+      goto fail_bt_blocks;
 
    return VK_SUCCESS;
 
 fail_bt_blocks:
-   u_vector_finish(&cmd_buffer->bt_blocks);
+   u_vector_finish(&cmd_buffer->bt_block_states);
 fail_seen_bbos:
    u_vector_finish(&cmd_buffer->seen_bbos);
 fail_batch_bo:
@@ -713,12 +738,10 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 void
 anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 {
-   int32_t *bt_block;
-   u_vector_foreach(bt_block, &cmd_buffer->bt_blocks) {
-      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
-                          *bt_block);
-   }
-   u_vector_finish(&cmd_buffer->bt_blocks);
+   struct anv_state *bt_block;
+   u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
+      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
+   u_vector_finish(&cmd_buffer->bt_block_states);
 
    anv_reloc_list_finish(&cmd_buffer->surface_relocs,
                          &cmd_buffer->pool->alloc);
@@ -729,9 +752,6 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
                        &cmd_buffer->batch_bos, link) {
       anv_batch_bo_destroy(bbo, cmd_buffer);
    }
-
-   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.objects);
-   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.bos);
 }
 
 void
@@ -750,15 +770,15 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
                       &cmd_buffer->batch,
                       GEN8_MI_BATCH_BUFFER_START_length * 4);
 
-   while (u_vector_length(&cmd_buffer->bt_blocks) > 1) {
-      int32_t *bt_block = u_vector_remove(&cmd_buffer->bt_blocks);
-      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
-                          *bt_block);
+   while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {
+      struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
+      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
    }
-   assert(u_vector_length(&cmd_buffer->bt_blocks) == 1);
+   assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
 
    cmd_buffer->bt_next = 0;
 
    cmd_buffer->surface_relocs.num_relocs = 0;
+   cmd_buffer->last_ss_pool_center = 0;
 
    /* Reset the list of seen buffers */
    cmd_buffer->seen_bbos.head = 0;
@@ -783,11 +803,11 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
       cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4;
       assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size);
 
-      anv_batch_emit(&cmd_buffer->batch, GEN7_MI_BATCH_BUFFER_END, bbe);
+      anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_END, bbe);
 
       /* Round batch up to an even number of dwords. */
       if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
-         anv_batch_emit(&cmd_buffer->batch, GEN7_MI_NOOP, noop);
+         anv_batch_emit(&cmd_buffer->batch, GEN8_MI_NOOP, noop);
 
       cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
@@ -826,7 +846,7 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
-static inline VkResult
+static VkResult
 anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
                              struct list_head *list)
 {
@@ -928,63 +948,98 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
                          &secondary->surface_relocs, 0);
 }
 
+struct anv_execbuf {
+   struct drm_i915_gem_execbuffer2           execbuf;
+
+   struct drm_i915_gem_exec_object2 *        objects;
+   uint32_t                                  bo_count;
+   struct anv_bo **                          bos;
+
+   /* Allocated length of the 'objects' and 'bos' arrays */
+   uint32_t                                  array_length;
+
+   uint32_t                                  fence_count;
+   uint32_t                                  fence_array_length;
+   struct drm_i915_gem_exec_fence *          fences;
+   struct anv_syncobj **                     syncobjs;
+};
+
+static void
+anv_execbuf_init(struct anv_execbuf *exec)
+{
+   memset(exec, 0, sizeof(*exec));
+}
+
+static void
+anv_execbuf_finish(struct anv_execbuf *exec,
+                   const VkAllocationCallbacks *alloc)
+{
+   vk_free(alloc, exec->objects);
+   vk_free(alloc, exec->bos);
+   vk_free(alloc, exec->fences);
+   vk_free(alloc, exec->syncobjs);
+}
+
 static VkResult
-anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer,
-                      struct anv_bo *bo,
-                      struct anv_reloc_list *relocs)
+anv_execbuf_add_bo(struct anv_execbuf *exec,
+                   struct anv_bo *bo,
+                   struct anv_reloc_list *relocs,
+                   uint32_t extra_flags,
+                   const VkAllocationCallbacks *alloc)
 {
    struct drm_i915_gem_exec_object2 *obj = NULL;
 
-   if (bo->index < cmd_buffer->execbuf2.bo_count &&
-       cmd_buffer->execbuf2.bos[bo->index] == bo)
-      obj = &cmd_buffer->execbuf2.objects[bo->index];
+   if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
+      obj = &exec->objects[bo->index];
 
    if (obj == NULL) {
       /* We've never seen this one before.  Add it to the list and assign
        * an id that we can use later.
        */
-      if (cmd_buffer->execbuf2.bo_count >= cmd_buffer->execbuf2.array_length) {
-         uint32_t new_len = cmd_buffer->execbuf2.objects ?
-                            cmd_buffer->execbuf2.array_length * 2 : 64;
+      if (exec->bo_count >= exec->array_length) {
+         uint32_t new_len = exec->objects ? exec->array_length * 2 : 64;
 
          struct drm_i915_gem_exec_object2 *new_objects =
-            vk_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_objects),
-                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+            vk_alloc(alloc, new_len * sizeof(*new_objects),
+                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (new_objects == NULL)
            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
        struct anv_bo **new_bos =
-            vk_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_bos),
-                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+            vk_alloc(alloc, new_len * sizeof(*new_bos),
+                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
        if (new_bos == NULL) {
-            vk_free(&cmd_buffer->pool->alloc, new_objects);
+            vk_free(alloc, new_objects);
           return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
        }
 
-         if (cmd_buffer->execbuf2.objects) {
-            memcpy(new_objects, cmd_buffer->execbuf2.objects,
-                   cmd_buffer->execbuf2.bo_count * sizeof(*new_objects));
-            memcpy(new_bos, cmd_buffer->execbuf2.bos,
-                   cmd_buffer->execbuf2.bo_count * sizeof(*new_bos));
+         if (exec->objects) {
+            memcpy(new_objects, exec->objects,
+                   exec->bo_count * sizeof(*new_objects));
+            memcpy(new_bos, exec->bos,
+                   exec->bo_count * sizeof(*new_bos));
        }
 
-         cmd_buffer->execbuf2.objects = new_objects;
-         cmd_buffer->execbuf2.bos = new_bos;
-         cmd_buffer->execbuf2.array_length = new_len;
+         vk_free(alloc, exec->objects);
+         vk_free(alloc, exec->bos);
+
+         exec->objects = new_objects;
+         exec->bos = new_bos;
+         exec->array_length = new_len;
      }
 
-      assert(cmd_buffer->execbuf2.bo_count < cmd_buffer->execbuf2.array_length);
+      assert(exec->bo_count < exec->array_length);
 
-      bo->index = cmd_buffer->execbuf2.bo_count++;
-      obj = &cmd_buffer->execbuf2.objects[bo->index];
-      cmd_buffer->execbuf2.bos[bo->index] = bo;
+      bo->index = exec->bo_count++;
+      obj = &exec->objects[bo->index];
+      exec->bos[bo->index] = bo;
 
       obj->handle = bo->gem_handle;
       obj->relocation_count = 0;
       obj->relocs_ptr = 0;
       obj->alignment = 0;
       obj->offset = bo->offset;
-      obj->flags = bo->is_winsys_bo ? EXEC_OBJECT_WRITE : 0;
+      obj->flags = bo->flags | extra_flags;
      obj->rsvd1 = 0;
      obj->rsvd2 = 0;
    }
@@ -998,95 +1053,100 @@ anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer,
       obj->relocs_ptr = (uintptr_t) relocs->relocs;
 
       for (size_t i = 0; i < relocs->num_relocs; i++) {
+         VkResult result;
+
          /* A quick sanity check on relocations */
          assert(relocs->relocs[i].offset < bo->size);
-         anv_cmd_buffer_add_bo(cmd_buffer, relocs->reloc_bos[i], NULL);
+         result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL,
+                                     extra_flags, alloc);
+
+         if (result != VK_SUCCESS)
+            return result;
       }
    }
 
    return VK_SUCCESS;
 }
 
-static void
-anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
-                              struct anv_reloc_list *list)
+static VkResult
+anv_execbuf_add_syncobj(struct anv_execbuf *exec,
+                        uint32_t handle, uint32_t flags,
+                        const VkAllocationCallbacks *alloc)
 {
-   struct anv_bo *bo;
-
-   /* If the kernel supports I915_EXEC_NO_RELOC, it will compare offset in
-    * struct drm_i915_gem_exec_object2 against the bos current offset and if
-    * all bos haven't moved it will skip relocation processing alltogether.
-    * If I915_EXEC_NO_RELOC is not supported, the kernel ignores the incoming
-    * value of offset so we can set it either way.  For that to work we need
-    * to make sure all relocs use the same presumed offset.
-    */
+   assert(flags != 0);
 
-   for (size_t i = 0; i < list->num_relocs; i++) {
-      bo = list->reloc_bos[i];
-      if (bo->offset != list->relocs[i].presumed_offset)
-         cmd_buffer->execbuf2.need_reloc = true;
+   if (exec->fence_count >= exec->fence_array_length) {
+      uint32_t new_len = MAX2(exec->fence_array_length * 2, 64);
 
-      list->relocs[i].target_handle = bo->index;
+      exec->fences = vk_realloc(alloc, exec->fences,
+                                new_len * sizeof(*exec->fences),
+                                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+      if (exec->fences == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      exec->fence_array_length = new_len;
    }
+
+   exec->fences[exec->fence_count] = (struct drm_i915_gem_exec_fence) {
+      .handle = handle,
+      .flags = flags,
+   };
+
+   exec->fence_count++;
+
+   return VK_SUCCESS;
 }
 
-static uint64_t
-read_reloc(const struct anv_device *device, const void *p)
+static void
+anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
+                              struct anv_reloc_list *list)
 {
-   if (device->info.gen >= 8)
-      return *(uint64_t *)p;
-   else
-      return *(uint32_t *)p;
+   for (size_t i = 0; i < list->num_relocs; i++)
+      list->relocs[i].target_handle = list->reloc_bos[i]->index;
 }
 
 static void
-write_reloc(const struct anv_device *device, void *p, uint64_t v)
+write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
 {
-   if (device->info.gen >= 8)
-      *(uint64_t *)p = v;
-   else
+   unsigned reloc_size = 0;
+   if (device->info.gen >= 8) {
+      reloc_size = sizeof(uint64_t);
+      *(uint64_t *)p = gen_canonical_address(v);
+   } else {
+      reloc_size = sizeof(uint32_t);
       *(uint32_t *)p = v;
+   }
+
+   if (flush && !device->info.has_llc)
+      gen_flush_range(p, reloc_size);
 }
 
 static void
-adjust_relocations_from_block_pool(struct anv_block_pool *pool,
-                                   struct anv_reloc_list *relocs)
+adjust_relocations_from_state_pool(struct anv_state_pool *pool,
+                                   struct anv_reloc_list *relocs,
+                                   uint32_t last_pool_center_bo_offset)
 {
-   for (size_t i = 0; i < relocs->num_relocs; i++) {
-      /* In general, we don't know how stale the relocated value is.  It
-       * may have been used last time or it may not.  Since we don't want
-       * to stomp it while the GPU may be accessing it, we haven't updated
-       * it anywhere else in the code.  Instead, we just set the presumed
-       * offset to what it is now based on the delta and the data in the
-       * block pool.  Then the kernel will update it for us if needed.
-       */
-      assert(relocs->relocs[i].offset < pool->state.end);
-      const void *p = pool->map + relocs->relocs[i].offset;
-
-      /* We're reading back the relocated value from potentially incoherent
-       * memory here. However, any change to the value will be from the kernel
-       * writing out relocations, which will keep the CPU cache up to date.
-       */
-      relocs->relocs[i].presumed_offset =
-         read_reloc(pool->device, p) - relocs->relocs[i].delta;
+   assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
+   uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
 
+   for (size_t i = 0; i < relocs->num_relocs; i++) {
       /* All of the relocations from this block pool to other BO's should
        * have been emitted relative to the surface block pool center.  We
        * need to add the center offset to make them relative to the
        * beginning of the actual GEM bo.
        */
-      relocs->relocs[i].offset += pool->center_bo_offset;
+      relocs->relocs[i].offset += delta;
    }
 }
 
 static void
-adjust_relocations_to_block_pool(struct anv_block_pool *pool,
+adjust_relocations_to_state_pool(struct anv_state_pool *pool,
                                  struct anv_bo *from_bo,
                                  struct anv_reloc_list *relocs,
-                                 uint32_t *last_pool_center_bo_offset)
+                                 uint32_t last_pool_center_bo_offset)
 {
-   assert(*last_pool_center_bo_offset <= pool->center_bo_offset);
-   uint32_t delta = pool->center_bo_offset - *last_pool_center_bo_offset;
+   assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
+   uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
 
    /* When we initially emit relocations into a block pool, we don't
     * actually know what the final center_bo_offset will be so we just emit
@@ -1095,7 +1155,7 @@ adjust_relocations_to_block_pool(struct anv_block_pool *pool,
     * relocations that point to the pool bo with the correct offset.
     */
    for (size_t i = 0; i < relocs->num_relocs; i++) {
-      if (relocs->reloc_bos[i] == &pool->bo) {
+      if (relocs->reloc_bos[i] == &pool->block_pool.bo) {
          /* Adjust the delta value in the relocation to correctly
          * correspond to the new delta.  Initially, this value may have
          * been negative (if treated as unsigned), but we trust in
@@ -1109,39 +1169,152 @@ adjust_relocations_to_block_pool(struct anv_block_pool *pool,
          * use by the GPU at the moment.
          */
          assert(relocs->relocs[i].offset < from_bo->size);
-         write_reloc(pool->device, from_bo->map + relocs->relocs[i].offset,
+         write_reloc(pool->block_pool.device,
+                     from_bo->map + relocs->relocs[i].offset,
                      relocs->relocs[i].presumed_offset +
-                     relocs->relocs[i].delta);
+                     relocs->relocs[i].delta, false);
       }
    }
+}
 
-   *last_pool_center_bo_offset = pool->center_bo_offset;
+static void
+anv_reloc_list_apply(struct anv_device *device,
+                     struct anv_reloc_list *list,
+                     struct anv_bo *bo,
+                     bool always_relocate)
+{
+   for (size_t i = 0; i < list->num_relocs; i++) {
+      struct anv_bo *target_bo = list->reloc_bos[i];
+      if (list->relocs[i].presumed_offset == target_bo->offset &&
+          !always_relocate)
+         continue;
+
+      void *p = bo->map + list->relocs[i].offset;
+      write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
+      list->relocs[i].presumed_offset = target_bo->offset;
+   }
 }
 
-void
-anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
+/**
+ * This function applies the relocation for a command buffer and writes the
+ * actual addresses into the buffers as per what we were told by the kernel on
+ * the previous execbuf2 call.  This should be safe to do because, for each
+ * relocated address, we have two cases:
+ *
+ * 1) The target BO is inactive (as seen by the kernel).  In this case, it is
+ *    not in use by the GPU so updating the address is 100% ok.  It won't be
+ *    in-use by the GPU (from our context) again until the next execbuf2
+ *    happens.  If the kernel decides to move it in the next execbuf2, it
+ *    will have to do the relocations itself, but that's ok because it should
+ *    have all of the information needed to do so.
+ *
+ * 2) The target BO is active (as seen by the kernel).  In this case, it
+ *    hasn't moved since the last execbuffer2 call because GTT shuffling
+ *    *only* happens when the BO is idle. (From our perspective, it only
+ *    happens inside the execbuffer2 ioctl, but the shuffling may be
+ *    triggered by another ioctl, with full-ppgtt this is limited to only
+ *    execbuffer2 ioctls on the same context, or memory pressure.)  Since the
+ *    target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
+ *    address and the relocated value we are writing into the BO will be the
+ *    same as the value that is already there.
+ *
+ *    There is also a possibility that the target BO is active but the exact
+ *    RENDER_SURFACE_STATE object we are writing the relocation into isn't in
+ *    use.  In this case, the address currently in the RENDER_SURFACE_STATE
+ *    may be stale but it's still safe to write the relocation because that
+ *    particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
+ *    won't be until the next execbuf2 call.
+ *
+ * By doing relocations on the CPU, we can tell the kernel that it doesn't
+ * need to bother.  We want to do this because the surface state buffer is
+ * used by every command buffer so, if the kernel does the relocations, it
+ * will always be busy and the kernel will always stall.  This is also
+ * probably the fastest mechanism for doing relocations since the kernel would
+ * have to make a full copy of all the relocations lists.
+ */
+static bool
+relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
+                    struct anv_execbuf *exec)
 {
-   struct anv_batch *batch = &cmd_buffer->batch;
-   struct anv_block_pool *ss_pool =
-      &cmd_buffer->device->surface_state_block_pool;
+   static int userspace_relocs = -1;
+   if (userspace_relocs < 0)
+      userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
+   if (!userspace_relocs)
+      return false;
+
+   /* First, we have to check to see whether or not we can even do the
+    * relocation.  New buffers which have never been submitted to the kernel
+    * don't have a valid offset so we need to let the kernel do relocations so
+    * that we can get offsets for them.  On future execbuf2 calls, those
+    * buffers will have offsets and we will be able to skip relocating.
+    * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
+    */
+   for (uint32_t i = 0; i < exec->bo_count; i++) {
+      if (exec->bos[i]->offset == (uint64_t)-1)
+         return false;
+   }
 
-   cmd_buffer->execbuf2.bo_count = 0;
-   cmd_buffer->execbuf2.need_reloc = false;
+   /* Since surface states are shared between command buffers and we don't
+    * know what order they will be submitted to the kernel, we don't know
+    * what address is actually written in the surface state object at any
+    * given time.  The only option is to always relocate them.
+    */
+   anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
+                        &cmd_buffer->device->surface_state_pool.block_pool.bo,
+                        true /* always relocate surface states */);
 
-   adjust_relocations_from_block_pool(ss_pool, &cmd_buffer->surface_relocs);
-   anv_cmd_buffer_add_bo(cmd_buffer, &ss_pool->bo, &cmd_buffer->surface_relocs);
+   /* Since we own all of the batch buffers, we know what values are stored
+    * in the relocated addresses and only have to update them if the offsets
+    * have changed.
+    */
+   struct anv_batch_bo **bbo;
+   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+      anv_reloc_list_apply(cmd_buffer->device,
+                           &(*bbo)->relocs, &(*bbo)->bo, false);
+   }
+
+   for (uint32_t i = 0; i < exec->bo_count; i++)
+      exec->objects[i].offset = exec->bos[i]->offset;
+
+   return true;
+}
+
+static VkResult
+setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
+                             struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_batch *batch = &cmd_buffer->batch;
+   struct anv_state_pool *ss_pool =
+      &cmd_buffer->device->surface_state_pool;
+
+   adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
+                                      cmd_buffer->last_ss_pool_center);
+   VkResult result = anv_execbuf_add_bo(execbuf, &ss_pool->block_pool.bo,
+                                        &cmd_buffer->surface_relocs, 0,
+                                        &cmd_buffer->device->alloc);
+   if (result != VK_SUCCESS)
+      return result;
 
    /* First, we walk over all of the bos we've seen and add them and their
    * relocations to the validate list.
    */
    struct anv_batch_bo **bbo;
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
-      adjust_relocations_to_block_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
-                                       &(*bbo)->last_ss_pool_bo_offset);
+      adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
+                                       cmd_buffer->last_ss_pool_center);
 
-      anv_cmd_buffer_add_bo(cmd_buffer, &(*bbo)->bo, &(*bbo)->relocs);
+      result = anv_execbuf_add_bo(execbuf, &(*bbo)->bo, &(*bbo)->relocs, 0,
+                                  &cmd_buffer->device->alloc);
+      if (result != VK_SUCCESS)
+         return result;
    }
 
+   /* Now that we've adjusted all of the surface state relocations, we need to
+    * record the surface state pool center so future executions of the command
+    * buffer can adjust correctly.
+    */
+   cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
+
    struct anv_batch_bo *first_batch_bo =
       list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
@@ -1150,20 +1323,19 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
     * corresponding to the first batch_bo in the chain with the last
    * element in the list.
    */
-   if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
+   if (first_batch_bo->bo.index != execbuf->bo_count - 1) {
       uint32_t idx = first_batch_bo->bo.index;
-      uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;
+      uint32_t last_idx = execbuf->bo_count - 1;
 
-      struct drm_i915_gem_exec_object2 tmp_obj =
-         cmd_buffer->execbuf2.objects[idx];
-      assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);
+      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
+      assert(execbuf->bos[idx] == &first_batch_bo->bo);
 
-      cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
-      cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
-      cmd_buffer->execbuf2.bos[idx]->index = idx;
+      execbuf->objects[idx] = execbuf->objects[last_idx];
+      execbuf->bos[idx] = execbuf->bos[last_idx];
+      execbuf->bos[idx]->index = idx;
 
-      cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
-      cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
+      execbuf->objects[last_idx] = tmp_obj;
+      execbuf->bos[last_idx] = &first_batch_bo->bo;
       first_batch_bo->bo.index = last_idx;
    }
@@ -1184,29 +1356,303 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       }
    }
 
-   cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
-      .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
-      .buffer_count = cmd_buffer->execbuf2.bo_count,
+   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+      .buffers_ptr = (uintptr_t) execbuf->objects,
+      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = batch->next - batch->start,
      .cliprects_ptr = 0,
      .num_cliprects = 0,
      .DR1 = 0,
      .DR4 = 0,
-      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER |
-               I915_EXEC_CONSTANTS_REL_GENERAL,
+      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER,
      .rsvd1 = cmd_buffer->device->context_id,
      .rsvd2 = 0,
    };
 
-   if (!cmd_buffer->execbuf2.need_reloc)
-      cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC;
+   if (relocate_cmd_buffer(cmd_buffer, execbuf)) {
+      /* If we were able to successfully relocate everything, tell the kernel
+       * that it can skip doing relocations. The requirement for using
+       * NO_RELOC is:
+       *
+       *  1) The addresses written in the objects must match the corresponding
+       *     reloc.presumed_offset which in turn must match the corresponding
+       *     execobject.offset.
+       *
+       *  2) To avoid stalling, execobject.offset should match the current
+       *     address of that object within the active context.
+       *
+       * In order to satisfy all of the invariants that make userspace
+       * relocations to be safe (see relocate_cmd_buffer()), we need to
+       * further ensure that the addresses we use match those used by the
+       * kernel for the most recent execbuf2.
+       *
+       * The kernel may still choose to do relocations anyway if something has
+       * moved in the GTT. In this case, the relocation list still needs to be
+       * valid.  All relocations on the batch buffers are already valid and
+       * kept up-to-date.  For surface state relocations, by applying the
+       * relocations in relocate_cmd_buffer, we ensured that the address in
+       * the RENDER_SURFACE_STATE matches presumed_offset, so it should be
+       * safe for the kernel to relocate them as needed.
+       */
+      execbuf->execbuf.flags |= I915_EXEC_NO_RELOC;
+   } else {
+      /* In the case where we fall back to doing kernel relocations, we need
+       * to ensure that the relocation list is valid.  All relocations on the
+       * batch buffers are already valid and kept up-to-date.  Since surface
+       * states are shared between command buffers and we don't know what
+       * order they will be submitted to the kernel, we don't know what
+       * address is actually written in the surface state object at any given
+       * time.  The only option is to set a bogus presumed offset and let the
+       * kernel relocate them.
+       */
+      for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
+         cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device)
+{
+   VkResult result = anv_execbuf_add_bo(execbuf, &device->trivial_batch_bo,
+                                        NULL, 0, &device->alloc);
+   if (result != VK_SUCCESS)
+      return result;
+
+   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+      .buffers_ptr = (uintptr_t) execbuf->objects,
+      .buffer_count = execbuf->bo_count,
+      .batch_start_offset = 0,
+      .batch_len = 8, /* GEN7_MI_BATCH_BUFFER_END and NOOP */
+      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER,
+      .rsvd1 = device->context_id,
+      .rsvd2 = 0,
+   };
+
+   return VK_SUCCESS;
 }
 
 VkResult
 anv_cmd_buffer_execbuf(struct anv_device *device,
-                       struct anv_cmd_buffer *cmd_buffer)
+                       struct anv_cmd_buffer *cmd_buffer,
+                       const VkSemaphore *in_semaphores,
+                       uint32_t num_in_semaphores,
+                       const VkSemaphore *out_semaphores,
+                       uint32_t num_out_semaphores,
+                       VkFence _fence)
 {
-   return anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
-                             cmd_buffer->execbuf2.bos);
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+
+   struct anv_execbuf execbuf;
+   anv_execbuf_init(&execbuf);
+
+   int in_fence = -1;
+   VkResult result = VK_SUCCESS;
+   for (uint32_t i = 0; i < num_in_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
+      struct anv_semaphore_impl *impl =
+         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
+         &semaphore->temporary : &semaphore->permanent;
+
+      switch (impl->type) {
+      case ANV_SEMAPHORE_TYPE_BO:
+         result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL,
+                                     0, &device->alloc);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
+         if (in_fence == -1) {
+            in_fence = impl->fd;
+         } else {
+            int merge = anv_gem_sync_file_merge(device, in_fence, impl->fd);
+            if (merge == -1)
+               return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+
+            close(impl->fd);
+            close(in_fence);
+            in_fence = merge;
+         }
+
+         impl->fd = -1;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
+         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
+                                          I915_EXEC_FENCE_WAIT,
+                                          &device->alloc);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   bool need_out_fence = false;
+   for (uint32_t i = 0; i < num_out_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
+
+      /* Under most circumstances, out fences won't be temporary.  However,
+       * the spec does allow it for opaque_fd.  From the Vulkan 1.0.53 spec:
+       *
+       *    "If the import is temporary, the implementation must restore the
+       *    semaphore to its prior permanent state after submitting the next
+       *    semaphore wait operation."
+       *
+       * The spec says nothing whatsoever about signal operations on
+       * temporarily imported semaphores so it appears they are allowed.
+       * There are also CTS tests that require this to work.
+       */
+      struct anv_semaphore_impl *impl =
+         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
+         &semaphore->temporary : &semaphore->permanent;
+
+      switch (impl->type) {
+      case ANV_SEMAPHORE_TYPE_BO:
+         result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL,
+                                     EXEC_OBJECT_WRITE, &device->alloc);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
+         need_out_fence = true;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
+         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
+                                          I915_EXEC_FENCE_SIGNAL,
+                                          &device->alloc);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   if (fence) {
+      /* Under most circumstances, out fences won't be temporary.  However,
+       * the spec does allow it for opaque_fd.  From the Vulkan 1.0.53 spec:
+       *
+       *    "If the import is temporary, the implementation must restore the
+       *    semaphore to its prior permanent state after submitting the next
+       *    semaphore wait operation."
+       *
+       * The spec says nothing whatsoever about signal operations on
+       * temporarily imported semaphores so it appears they are allowed.
+       * There are also CTS tests that require this to work.
+       */
+      struct anv_fence_impl *impl =
+         fence->temporary.type != ANV_FENCE_TYPE_NONE ?
+         &fence->temporary : &fence->permanent;
+
+      switch (impl->type) {
+      case ANV_FENCE_TYPE_BO:
+         result = anv_execbuf_add_bo(&execbuf, &impl->bo.bo, NULL,
+                                     EXEC_OBJECT_WRITE, &device->alloc);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+
+      case ANV_FENCE_TYPE_SYNCOBJ:
+         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
+                                          I915_EXEC_FENCE_SIGNAL,
+                                          &device->alloc);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+
+      default:
+         unreachable("Invalid fence type");
+      }
+   }
+
+   if (cmd_buffer)
+      result = setup_execbuf_for_cmd_buffer(&execbuf, cmd_buffer);
+   else
+      result = setup_empty_execbuf(&execbuf, device);
+
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (execbuf.fence_count > 0) {
+      assert(device->instance->physicalDevice.has_syncobj);
+      execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY;
+      execbuf.execbuf.num_cliprects = execbuf.fence_count;
+      execbuf.execbuf.cliprects_ptr = (uintptr_t) execbuf.fences;
+   }
+
+   if (in_fence != -1) {
+      execbuf.execbuf.flags |= I915_EXEC_FENCE_IN;
+      execbuf.execbuf.rsvd2 |= (uint32_t)in_fence;
+   }
+
+   if (need_out_fence)
+      execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;
+
+   result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos);
+
+   /* Execbuf does not consume the in_fence.  It's our job to close it. */
+   if (in_fence != -1)
+      close(in_fence);
+
+   for (uint32_t i = 0; i < num_in_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
+      /* From the Vulkan 1.0.53 spec:
+       *
+       *    "If the import is temporary, the implementation must restore the
+       *    semaphore to its prior permanent state after submitting the next
+       *    semaphore wait operation."
+       *
+       * This has to happen after the execbuf in case we close any syncobjs in
+       * the process.
+       */
+      anv_semaphore_reset_temporary(device, semaphore);
+   }
+
+   if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) {
+      /* BO fences can't be shared, so they can't be temporary. */
+      assert(fence->temporary.type == ANV_FENCE_TYPE_NONE);
+
+      /* Once the execbuf has returned, we need to set the fence state to
+       * SUBMITTED.  We can't do this before calling execbuf because
+       * anv_GetFenceStatus does take the global device lock before checking
+       * fence->state.
+       *
+       * We set the fence state to SUBMITTED regardless of whether or not the
+       * execbuf succeeds because we need to ensure that vkWaitForFences() and
+       * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
+       * VK_SUCCESS) in a finite amount of time even if execbuf fails.
+       */
+      fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
+   }
+
+   if (result == VK_SUCCESS && need_out_fence) {
+      int out_fence = execbuf.execbuf.rsvd2 >> 32;
+      for (uint32_t i = 0; i < num_out_semaphores; i++) {
+         ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
+         /* Out fences can't have temporary state because that would imply
+          * that we imported a sync file and are trying to signal it.
+          */
+         assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
+         struct anv_semaphore_impl *impl = &semaphore->permanent;
+
+         if (impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE) {
+            assert(impl->fd == -1);
+            impl->fd = dup(out_fence);
+         }
+      }
+      close(out_fence);
+   }
+
+   anv_execbuf_finish(&execbuf, &device->alloc);
+
+   return result;
 }
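
A note on the write_reloc() change above: on gen8+ hardware a 48-bit graphics address must be written in canonical form, i.e. bits 63:48 have to be a sign-extension of bit 47, which is what the gen_canonical_address() call provides. A minimal standalone sketch of that transformation, assuming the usual shift-based implementation (the helper name canonical_address and the test values are illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Sign-extend a 48-bit graphics address into canonical 64-bit form:
 * shifting the address up by 16 and arithmetic-shifting it back down
 * replicates bit 47 into bits 63:48.
 */
static uint64_t
canonical_address(uint64_t address)
{
   return (uint64_t)(((int64_t)(address << 16)) >> 16);
}

int
main(void)
{
   /* Addresses below the 47-bit boundary are unchanged. */
   assert(canonical_address(0x00007fffdeadbeefull) == 0x00007fffdeadbeefull);

   /* Addresses with bit 47 set get ones in the top 16 bits. */
   assert(canonical_address(0x0000800000000000ull) == 0xffff800000000000ull);

   return 0;
}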
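The validation-list management in anv_execbuf_add_bo() above follows a standard amortized-doubling scheme: start at 64 entries, double whenever the list fills, and keep bo->index as a stable handle into the array. A self-contained sketch of just that pattern, with plain realloc standing in for the vk_alloc/memcpy/vk_free sequence of the real code (struct bo and exec_list_add are illustrative names, not mesa API):

#include <stdint.h>
#include <stdlib.h>

struct bo {
   uint32_t gem_handle;
   uint32_t index;       /* position in the validation list, once added */
};

struct exec_list {
   struct bo **bos;          /* the validation list */
   uint32_t    bo_count;     /* entries in use      */
   uint32_t    array_length; /* allocated capacity  */
};

/* Add a BO at most once, assigning it a stable index; doubling the array
 * makes repeated adds amortized O(1).
 */
static int
exec_list_add(struct exec_list *exec, struct bo *bo)
{
   /* bo->index is trusted only if it points back at this exact BO. */
   if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
      return 0;

   if (exec->bo_count >= exec->array_length) {
      uint32_t new_len = exec->bos ? exec->array_length * 2 : 64;
      struct bo **new_bos = realloc(exec->bos, new_len * sizeof(*new_bos));
      if (new_bos == NULL)
         return -1;
      exec->bos = new_bos;
      exec->array_length = new_len;
   }

   bo->index = exec->bo_count++;
   exec->bos[bo->index] = bo;
   return 0;
}

The patch allocates fresh arrays and memcpys rather than reallocating because it grows two parallel arrays (objects[] and bos[]) in lockstep and must be able to back out cleanly if the second allocation fails.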
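The fence plumbing in anv_cmd_buffer_execbuf() relies on the i915 convention that the I915_EXEC_FENCE_IN fd occupies the low 32 bits of drm_i915_gem_execbuffer2::rsvd2 while the I915_EXEC_FENCE_OUT fd is returned in the high 32 bits; hence the rsvd2 |= (uint32_t)in_fence before the ioctl and the rsvd2 >> 32 afterwards in the code above. A tiny sketch of that packing (no ioctl is issued and the fd values are made up):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   uint64_t rsvd2 = 0;

   rsvd2 |= (uint32_t)7;          /* in-fence fd, I915_EXEC_FENCE_IN */
   rsvd2 |= (uint64_t)9 << 32;    /* as if the kernel returned out fd 9 */

   int out_fence = rsvd2 >> 32;   /* I915_EXEC_FENCE_OUT */
   assert((uint32_t)rsvd2 == 7 && out_fence == 9);
   return 0;
}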