X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2Fanv_batch_chain.c;h=ce429ff2d85536da878066f01b1552779f70087e;hb=e94c22429b64f419d9a66f04fa5ecdad33f7f5ef;hp=e08e07ad7bd7062416c9c54256e3f92c8f09e755;hpb=f5bab06428fc7ca6116cf0daf1c237eb86202e7a;p=mesa.git diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index e08e07ad7bd..ce429ff2d85 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -30,6 +30,8 @@ #include "anv_private.h" #include "genxml/gen8_pack.h" +#include "genxml/genX_bits.h" +#include "perf/gen_perf.h" #include "util/debug.h" @@ -46,63 +48,59 @@ * Functions related to anv_reloc_list *-----------------------------------------------------------------------*/ +VkResult +anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + memset(list, 0, sizeof(*list)); + return VK_SUCCESS; +} + static VkResult anv_reloc_list_init_clone(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, const struct anv_reloc_list *other_list) { - if (other_list) { - list->num_relocs = other_list->num_relocs; - list->array_length = other_list->array_length; - } else { - list->num_relocs = 0; - list->array_length = 256; - } - - list->relocs = - vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (list->relocs == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - list->reloc_bos = - vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (list->reloc_bos == NULL) { - vk_free(alloc, list->relocs); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } - - list->deps = _mesa_set_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); + list->num_relocs = other_list->num_relocs; + list->array_length = other_list->array_length; + + if (list->num_relocs > 0) { + list->relocs = + vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->relocs == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - if (!list->deps) { - vk_free(alloc, list->relocs); - vk_free(alloc, list->reloc_bos); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } + list->reloc_bos = + vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->reloc_bos == NULL) { + vk_free(alloc, list->relocs); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } - if (other_list) { memcpy(list->relocs, other_list->relocs, list->array_length * sizeof(*list->relocs)); memcpy(list->reloc_bos, other_list->reloc_bos, list->array_length * sizeof(*list->reloc_bos)); - struct set_entry *entry; - set_foreach(other_list->deps, entry) { - _mesa_set_add_pre_hashed(list->deps, entry->hash, entry->key); - } + } else { + list->relocs = NULL; + list->reloc_bos = NULL; } - return VK_SUCCESS; -} + list->dep_words = other_list->dep_words; -VkResult -anv_reloc_list_init(struct anv_reloc_list *list, - const VkAllocationCallbacks *alloc) -{ - return anv_reloc_list_init_clone(list, alloc, NULL); + if (list->dep_words > 0) { + list->deps = + vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + memcpy(list->deps, other_list->deps, + list->dep_words * sizeof(BITSET_WORD)); + } else { + list->deps = NULL; + } + + return VK_SUCCESS; } void @@ -111,7 +109,7 @@ anv_reloc_list_finish(struct anv_reloc_list *list, { vk_free(alloc, list->relocs); vk_free(alloc, 
list->reloc_bos); - _mesa_set_destroy(list->deps, NULL); + vk_free(alloc, list->deps); } static VkResult @@ -122,48 +120,79 @@ anv_reloc_list_grow(struct anv_reloc_list *list, if (list->num_relocs + num_additional_relocs <= list->array_length) return VK_SUCCESS; - size_t new_length = list->array_length * 2; + size_t new_length = MAX2(16, list->array_length * 2); while (new_length < list->num_relocs + num_additional_relocs) new_length *= 2; struct drm_i915_gem_relocation_entry *new_relocs = - vk_alloc(alloc, new_length * sizeof(*list->relocs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + vk_realloc(alloc, list->relocs, + new_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (new_relocs == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + list->relocs = new_relocs; struct anv_bo **new_reloc_bos = - vk_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (new_reloc_bos == NULL) { - vk_free(alloc, new_relocs); + vk_realloc(alloc, list->reloc_bos, + new_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_reloc_bos == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } + list->reloc_bos = new_reloc_bos; - memcpy(new_relocs, list->relocs, list->num_relocs * sizeof(*list->relocs)); - memcpy(new_reloc_bos, list->reloc_bos, - list->num_relocs * sizeof(*list->reloc_bos)); + list->array_length = new_length; - vk_free(alloc, list->relocs); - vk_free(alloc, list->reloc_bos); + return VK_SUCCESS; +} - list->array_length = new_length; - list->relocs = new_relocs; - list->reloc_bos = new_reloc_bos; +static VkResult +anv_reloc_list_grow_deps(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t min_num_words) +{ + if (min_num_words <= list->dep_words) + return VK_SUCCESS; + + uint32_t new_length = MAX2(32, list->dep_words * 2); + while (new_length < min_num_words) + new_length *= 2; + + BITSET_WORD *new_deps = + vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_deps == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + list->deps = new_deps; + + /* Zero out the new data */ + memset(list->deps + list->dep_words, 0, + (new_length - list->dep_words) * sizeof(BITSET_WORD)); + list->dep_words = new_length; return VK_SUCCESS; } +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + VkResult anv_reloc_list_add(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, - uint32_t offset, struct anv_bo *target_bo, uint32_t delta) + uint32_t offset, struct anv_bo *target_bo, uint32_t delta, + uint64_t *address_u64_out) { struct drm_i915_gem_relocation_entry *entry; int index; - if (target_bo->flags & EXEC_OBJECT_PINNED) { - _mesa_set_add(list->deps, target_bo); + struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo); + uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset); + if (address_u64_out) + *address_u64_out = target_bo_offset + delta; + + if (unwrapped_target_bo->flags & EXEC_OBJECT_PINNED) { + assert(!target_bo->is_wrapper); + uint32_t idx = unwrapped_target_bo->gem_handle; + anv_reloc_list_grow_deps(list, alloc, (idx / BITSET_WORDBITS) + 1); + BITSET_SET(list->deps, unwrapped_target_bo->gem_handle); return VK_SUCCESS; } @@ -175,10 +204,10 @@ anv_reloc_list_add(struct anv_reloc_list *list, index = list->num_relocs++; list->reloc_bos[index] = target_bo; entry = &list->relocs[index]; - entry->target_handle = target_bo->gem_handle; + entry->target_handle = -1; /* See also 
anv_cmd_buffer_process_relocs() */ entry->delta = delta; entry->offset = offset; - entry->presumed_offset = target_bo->offset; + entry->presumed_offset = target_bo_offset; entry->read_domains = 0; entry->write_domain = 0; VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry))); @@ -186,6 +215,14 @@ anv_reloc_list_add(struct anv_reloc_list *list, return VK_SUCCESS; } +static void +anv_reloc_list_clear(struct anv_reloc_list *list) +{ + list->num_relocs = 0; + if (list->dep_words > 0) + memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD)); +} + static VkResult anv_reloc_list_append(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, @@ -195,21 +232,22 @@ anv_reloc_list_append(struct anv_reloc_list *list, if (result != VK_SUCCESS) return result; - memcpy(&list->relocs[list->num_relocs], &other->relocs[0], - other->num_relocs * sizeof(other->relocs[0])); - memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], - other->num_relocs * sizeof(other->reloc_bos[0])); - - for (uint32_t i = 0; i < other->num_relocs; i++) - list->relocs[i + list->num_relocs].offset += offset; + if (other->num_relocs > 0) { + memcpy(&list->relocs[list->num_relocs], &other->relocs[0], + other->num_relocs * sizeof(other->relocs[0])); + memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], + other->num_relocs * sizeof(other->reloc_bos[0])); - list->num_relocs += other->num_relocs; + for (uint32_t i = 0; i < other->num_relocs; i++) + list->relocs[i + list->num_relocs].offset += offset; - struct set_entry *entry; - set_foreach(other->deps, entry) { - _mesa_set_add_pre_hashed(list->deps, entry->hash, entry->key); + list->num_relocs += other->num_relocs; } + anv_reloc_list_grow_deps(list, alloc, other->dep_words); + for (uint32_t w = 0; w < other->dep_words; w++) + list->deps[w] |= other->deps[w]; + return VK_SUCCESS; } @@ -240,14 +278,27 @@ uint64_t anv_batch_emit_reloc(struct anv_batch *batch, void *location, struct anv_bo *bo, uint32_t delta) { + uint64_t address_u64 = 0; VkResult result = anv_reloc_list_add(batch->relocs, batch->alloc, - location - batch->start, bo, delta); + location - batch->start, bo, delta, + &address_u64); if (result != VK_SUCCESS) { anv_batch_set_error(batch, result); return 0; } - return bo->offset + delta; + return address_u64; +} + +struct anv_address +anv_batch_address(struct anv_batch *batch, void *batch_location) +{ + assert(batch->start < batch_location); + + /* Allow a jump at the current location of the batch. 
*/ + assert(batch->next >= batch_location); + + return anv_address_add(batch->start_addr, batch_location - batch->start); } void @@ -297,8 +348,8 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, if (bbo == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo, - ANV_CMD_BUFFER_BATCH_SIZE); + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + ANV_CMD_BUFFER_BATCH_SIZE, &bbo->bo); if (result != VK_SUCCESS) goto fail_alloc; @@ -311,7 +362,7 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, return VK_SUCCESS; fail_bo_alloc: - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); fail_alloc: vk_free(&cmd_buffer->pool->alloc, bbo); @@ -330,8 +381,8 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, if (bbo == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo, - other_bbo->bo.size); + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + other_bbo->bo->size, &bbo->bo); if (result != VK_SUCCESS) goto fail_alloc; @@ -341,14 +392,13 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, goto fail_bo_alloc; bbo->length = other_bbo->length; - memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length); - + memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length); *bbo_out = bbo; return VK_SUCCESS; fail_bo_alloc: - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); fail_alloc: vk_free(&cmd_buffer->pool->alloc, bbo); @@ -359,27 +409,27 @@ static void anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, size_t batch_padding) { - batch->next = batch->start = bbo->bo.map; - batch->end = bbo->bo.map + bbo->bo.size - batch_padding; + anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, }, + bbo->bo->map, bbo->bo->size - batch_padding); batch->relocs = &bbo->relocs; - bbo->relocs.num_relocs = 0; - _mesa_set_clear(bbo->relocs.deps, NULL); + anv_reloc_list_clear(&bbo->relocs); } static void anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch, size_t batch_padding) { - batch->start = bbo->bo.map; - batch->next = bbo->bo.map + bbo->length; - batch->end = bbo->bo.map + bbo->bo.size - batch_padding; + batch->start_addr = (struct anv_address) { .bo = bbo->bo, }; + batch->start = bbo->bo->map; + batch->next = bbo->bo->map + bbo->length; + batch->end = bbo->bo->map + bbo->bo->size - batch_padding; batch->relocs = &bbo->relocs; } static void anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch) { - assert(batch->start == bbo->bo.map); + assert(batch->start == bbo->bo->map); bbo->length = batch->next - batch->start; VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length)); } @@ -389,25 +439,25 @@ anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo, struct anv_batch *batch, size_t aditional, size_t batch_padding) { - assert(batch->start == bbo->bo.map); + assert(batch->start == bbo->bo->map); bbo->length = batch->next - batch->start; - size_t new_size = bbo->bo.size; + size_t new_size = bbo->bo->size; while (new_size <= bbo->length + aditional + batch_padding) new_size *= 2; - if (new_size == bbo->bo.size) + if (new_size == bbo->bo->size) return VK_SUCCESS; - struct anv_bo new_bo; + struct anv_bo *new_bo; VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, - 
&new_bo, new_size); + new_size, &new_bo); if (result != VK_SUCCESS) return result; - memcpy(new_bo.map, bbo->bo.map, bbo->length); + memcpy(new_bo->map, bbo->bo->map, bbo->length); - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); bbo->bo = new_bo; anv_batch_bo_continue(bbo, batch, batch_padding); @@ -421,26 +471,26 @@ anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *next_bbo, uint32_t next_bbo_offset) { - MAYBE_UNUSED const uint32_t bb_start_offset = + const uint32_t bb_start_offset = prev_bbo->length - GEN8_MI_BATCH_BUFFER_START_length * 4; - MAYBE_UNUSED const uint32_t *bb_start = prev_bbo->bo.map + bb_start_offset; + ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset; /* Make sure we're looking at a MI_BATCH_BUFFER_START */ assert(((*bb_start >> 29) & 0x07) == 0); assert(((*bb_start >> 23) & 0x3f) == 49); - if (cmd_buffer->device->instance->physicalDevice.use_softpin) { - assert(prev_bbo->bo.flags & EXEC_OBJECT_PINNED); - assert(next_bbo->bo.flags & EXEC_OBJECT_PINNED); + if (cmd_buffer->device->physical->use_softpin) { + assert(prev_bbo->bo->flags & EXEC_OBJECT_PINNED); + assert(next_bbo->bo->flags & EXEC_OBJECT_PINNED); write_reloc(cmd_buffer->device, - prev_bbo->bo.map + bb_start_offset + 4, - next_bbo->bo.offset + next_bbo_offset, true); + prev_bbo->bo->map + bb_start_offset + 4, + next_bbo->bo->offset + next_bbo_offset, true); } else { uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1; assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4); - prev_bbo->relocs.reloc_bos[reloc_idx] = &next_bbo->bo; + prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo; prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset; /* Use a bogus presumed offset to force a relocation */ @@ -453,7 +503,7 @@ anv_batch_bo_destroy(struct anv_batch_bo *bbo, struct anv_cmd_buffer *cmd_buffer) { anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc); - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); vk_free(&cmd_buffer->pool->alloc, bbo); } @@ -481,8 +531,10 @@ anv_batch_bo_list_clone(const struct list_head *list, } if (result != VK_SUCCESS) { - list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) + list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) { + list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); + } } return result; @@ -501,10 +553,11 @@ anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer) struct anv_address anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer) { + struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device); struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); return (struct anv_address) { - .bo = &anv_binding_table_pool(cmd_buffer->device)->block_pool.bo, - .offset = bt_block->offset, + .bo = pool->block_pool.bo, + .offset = bt_block->offset - pool->start_offset, }; } @@ -550,9 +603,9 @@ cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer, * chaining command, let's set it back where it should go. 
*/ batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(batch->end == current_bbo->bo.map + current_bbo->bo.size); + assert(batch->end == current_bbo->bo->map + current_bbo->bo->size); - emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0); + emit_batch_buffer_start(cmd_buffer, bbo->bo, 0); anv_batch_bo_finish(current_bbo, batch); } @@ -670,30 +723,21 @@ struct anv_state anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, uint32_t entries, uint32_t *state_offset) { - struct anv_device *device = cmd_buffer->device; - struct anv_state_pool *state_pool = &device->surface_state_pool; struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); - struct anv_state state; - state.alloc_size = align_u32(entries * 4, 32); + uint32_t bt_size = align_u32(entries * 4, 32); - if (cmd_buffer->bt_next + state.alloc_size > state_pool->block_size) + struct anv_state state = cmd_buffer->bt_next; + if (bt_size > state.alloc_size) return (struct anv_state) { 0 }; - state.offset = cmd_buffer->bt_next; - state.map = anv_binding_table_pool(device)->block_pool.map + - bt_block->offset + state.offset; - - cmd_buffer->bt_next += state.alloc_size; + state.alloc_size = bt_size; + cmd_buffer->bt_next.offset += bt_size; + cmd_buffer->bt_next.map += bt_size; + cmd_buffer->bt_next.alloc_size -= bt_size; - if (device->instance->physicalDevice.use_softpin) { - assert(bt_block->offset >= 0); - *state_offset = device->surface_state_pool.block_pool.start_address - - device->binding_table_pool.block_pool.start_address - bt_block->offset; - } else { - assert(bt_block->offset < 0); - *state_offset = -bt_block->offset; - } + assert(bt_block->offset < 0); + *state_offset = -bt_block->offset; return state; } @@ -724,7 +768,12 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer) } *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device); - cmd_buffer->bt_next = 0; + + /* The bt_next state is a rolling state (we update it as we suballocate + * from it) which is relative to the start of the binding table block. 
+ */ + cmd_buffer->bt_next = *bt_block; + cmd_buffer->bt_next.offset = 0; return VK_SUCCESS; } @@ -807,6 +856,7 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) /* Destroy all of the batch buffers */ list_for_each_entry_safe(struct anv_batch_bo, bbo, &cmd_buffer->batch_bos, link) { + list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); } } @@ -815,13 +865,13 @@ void anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) { /* Delete all but the first batch bo */ - assert(!list_empty(&cmd_buffer->batch_bos)); + assert(!list_is_empty(&cmd_buffer->batch_bos)); while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) { struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); } - assert(!list_empty(&cmd_buffer->batch_bos)); + assert(!list_is_empty(&cmd_buffer->batch_bos)); anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer), &cmd_buffer->batch, @@ -832,10 +882,10 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) anv_binding_table_pool_free(cmd_buffer->device, *bt_block); } assert(u_vector_length(&cmd_buffer->bt_block_states) == 1); - cmd_buffer->bt_next = 0; + cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states); + cmd_buffer->bt_next.offset = 0; - cmd_buffer->surface_relocs.num_relocs = 0; - _mesa_set_clear(cmd_buffer->surface_relocs.deps, NULL); + anv_reloc_list_clear(&cmd_buffer->surface_relocs); cmd_buffer->last_ss_pool_center = 0; /* Reset the list of seen buffers */ @@ -859,7 +909,7 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) * with our BATCH_BUFFER_END in another BO. */ cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_END, bbe); @@ -878,6 +928,34 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start; if (!cmd_buffer->device->can_chain_batches) { cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT; + } else if (cmd_buffer->device->physical->use_call_secondary) { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN; + /* If the secondary command buffer begins & ends in the same BO and + * its length is less than the length of CS prefetch, add some NOOPs + * instructions so the last MI_BATCH_BUFFER_START is outside the CS + * prefetch. + */ + if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) { + int32_t batch_len = + cmd_buffer->batch.next - cmd_buffer->batch.start; + + for (int32_t i = 0; i < (512 - batch_len); i += 4) + anv_batch_emit(&cmd_buffer->batch, GEN8_MI_NOOP, noop); + } + + void *jump_addr = + anv_batch_emitn(&cmd_buffer->batch, + GEN8_MI_BATCH_BUFFER_START_length, + GEN8_MI_BATCH_BUFFER_START, + .AddressSpaceIndicator = ASI_PPGTT, + .SecondLevelBatchBuffer = Firstlevelbatch) + + (GEN8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8); + cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr); + + /* The emit above may have caused us to chain batch buffers which + * would mean that batch_bo is no longer valid. 
+ */ + batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer); } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) && (length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) { /* If the secondary has exactly one batch buffer in its list *and* @@ -900,11 +978,11 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) * chaining command, let's set it back where it should go. */ cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(cmd_buffer->batch.start == batch_bo->bo.map); - assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size); + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); - emit_batch_buffer_start(cmd_buffer, &batch_bo->bo, 0); - assert(cmd_buffer->batch.start == batch_bo->bo.map); + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + assert(cmd_buffer->batch.start == batch_bo->bo->map); } else { cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN; } @@ -950,10 +1028,10 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, struct anv_batch_bo *last_bbo = list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link); - emit_batch_buffer_start(primary, &first_bbo->bo, 0); + emit_batch_buffer_start(primary, first_bbo->bo, 0); struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary); - assert(primary->batch.start == this_bbo->bo.map); + assert(primary->batch.start == this_bbo->bo->map); uint32_t offset = primary->batch.next - primary->batch.start; /* Make the tail of the secondary point back to right after the @@ -987,6 +1065,26 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, GEN8_MI_BATCH_BUFFER_START_length * 4); break; } + case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: { + struct anv_batch_bo *first_bbo = + list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); + + uint64_t *write_return_addr = + anv_batch_emitn(&primary->batch, + GEN8_MI_STORE_DATA_IMM_length + 1 /* QWord write */, + GEN8_MI_STORE_DATA_IMM, + .Address = secondary->return_addr) + + (GEN8_MI_STORE_DATA_IMM_ImmediateData_start / 8); + + emit_batch_buffer_start(primary, first_bbo->bo, 0); + + *write_return_addr = + anv_address_physical(anv_batch_address(&primary->batch, + primary->batch.next)); + + anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); + break; + } default: assert(!"Invalid execution mode"); } @@ -998,6 +1096,8 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, struct anv_execbuf { struct drm_i915_gem_execbuffer2 execbuf; + struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; + struct drm_i915_gem_exec_object2 * objects; uint32_t bo_count; struct anv_bo ** bos; @@ -1007,10 +1107,10 @@ struct anv_execbuf { bool has_relocs; - uint32_t fence_count; - uint32_t fence_array_length; - struct drm_i915_gem_exec_fence * fences; - struct anv_syncobj ** syncobjs; + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; + + int perf_query_pass; }; static void @@ -1020,33 +1120,48 @@ anv_execbuf_init(struct anv_execbuf *exec) } static void -anv_execbuf_finish(struct anv_execbuf *exec, - const VkAllocationCallbacks *alloc) +anv_execbuf_finish(struct anv_execbuf *exec) { - vk_free(alloc, exec->objects); - vk_free(alloc, exec->bos); - vk_free(alloc, exec->fences); - vk_free(alloc, exec->syncobjs); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); } -static int -_compare_bo_handles(const void *_bo1, const void *_bo2) +static void 
+anv_execbuf_add_ext(struct anv_execbuf *exec, + uint32_t ext_name, + struct i915_user_extension *ext) { - struct anv_bo * const *bo1 = _bo1; - struct anv_bo * const *bo2 = _bo2; + __u64 *iter = &exec->execbuf.cliprects_ptr; + + exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS; + + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; + } - return (*bo1)->gem_handle - (*bo2)->gem_handle; + ext->name = ext_name; + + *iter = (uintptr_t) ext; } static VkResult -anv_execbuf_add_bo(struct anv_execbuf *exec, +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags); + +static VkResult +anv_execbuf_add_bo(struct anv_device *device, + struct anv_execbuf *exec, struct anv_bo *bo, struct anv_reloc_list *relocs, - uint32_t extra_flags, - const VkAllocationCallbacks *alloc) + uint32_t extra_flags) { struct drm_i915_gem_exec_object2 *obj = NULL; + bo = anv_bo_unwrap(bo); + if (bo->index < exec->bo_count && exec->bos[bo->index] == bo) obj = &exec->objects[bo->index]; @@ -1058,16 +1173,14 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; struct drm_i915_gem_exec_object2 *new_objects = - vk_alloc(alloc, new_len * sizeof(*new_objects), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope); if (new_objects == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); struct anv_bo **new_bos = - vk_alloc(alloc, new_len * sizeof(*new_bos), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope); if (new_bos == NULL) { - vk_free(alloc, new_objects); + vk_free(exec->alloc, new_objects); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } @@ -1078,8 +1191,8 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, exec->bo_count * sizeof(*new_bos)); } - vk_free(alloc, exec->objects); - vk_free(alloc, exec->bos); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); exec->objects = new_objects; exec->bos = new_bos; @@ -1097,11 +1210,16 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, obj->relocs_ptr = 0; obj->alignment = 0; obj->offset = bo->offset; - obj->flags = (bo->flags & ~ANV_BO_FLAG_MASK) | extra_flags; + obj->flags = bo->flags | extra_flags; obj->rsvd1 = 0; obj->rsvd2 = 0; } + if (extra_flags & EXEC_OBJECT_WRITE) { + obj->flags |= EXEC_OBJECT_WRITE; + obj->flags &= ~EXEC_OBJECT_ASYNC; + } + if (relocs != NULL) { assert(obj->relocation_count == 0); @@ -1119,73 +1237,42 @@ anv_execbuf_add_bo(struct anv_execbuf *exec, /* A quick sanity check on relocations */ assert(relocs->relocs[i].offset < bo->size); - result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, - extra_flags, alloc); - + result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i], + NULL, extra_flags); if (result != VK_SUCCESS) return result; } } - if (relocs->deps && relocs->deps->entries > 0) { - const uint32_t entries = relocs->deps->entries; - struct anv_bo **bos = - vk_alloc(alloc, entries * sizeof(*bos), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (bos == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - struct set_entry *entry; - struct anv_bo **bo = bos; - set_foreach(relocs->deps, entry) { - *bo++ = (void *)entry->key; - } - - qsort(bos, entries, sizeof(struct anv_bo*), _compare_bo_handles); - - VkResult result = VK_SUCCESS; - for (bo = bos; bo < bos + entries; bo++) { - result = 
anv_execbuf_add_bo(exec, *bo, NULL, extra_flags, alloc); - if (result != VK_SUCCESS) - break; - } - - vk_free(alloc, bos); - - if (result != VK_SUCCESS) - return result; - } + return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, + relocs->deps, extra_flags); } return VK_SUCCESS; } +/* Add BO dependencies to execbuf */ static VkResult -anv_execbuf_add_syncobj(struct anv_execbuf *exec, - uint32_t handle, uint32_t flags, - const VkAllocationCallbacks *alloc) +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags) { - assert(flags != 0); - - if (exec->fence_count >= exec->fence_array_length) { - uint32_t new_len = MAX2(exec->fence_array_length * 2, 64); - - exec->fences = vk_realloc(alloc, exec->fences, - new_len * sizeof(*exec->fences), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (exec->fences == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - exec->fence_array_length = new_len; + for (uint32_t w = 0; w < dep_words; w++) { + BITSET_WORD mask = deps[w]; + while (mask) { + int i = u_bit_scan(&mask); + uint32_t gem_handle = w * BITSET_WORDBITS + i; + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + assert(bo->refcount > 0); + VkResult result = + anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } } - exec->fences[exec->fence_count] = (struct drm_i915_gem_exec_fence) { - .handle = handle, - .flags = flags, - }; - - exec->fence_count++; - return VK_SUCCESS; } @@ -1194,7 +1281,7 @@ anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, struct anv_reloc_list *list) { for (size_t i = 0; i < list->num_relocs; i++) - list->relocs[i].target_handle = list->reloc_bos[i]->index; + list->relocs[i].target_handle = anv_bo_unwrap(list->reloc_bos[i])->index; } static void @@ -1221,6 +1308,7 @@ adjust_relocations_to_state_pool(struct anv_state_pool *pool, struct anv_reloc_list *relocs, uint32_t last_pool_center_bo_offset) { + assert(!from_bo->is_wrapper); assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; @@ -1231,7 +1319,7 @@ adjust_relocations_to_state_pool(struct anv_state_pool *pool, * relocations that point to the pool bo with the correct offset. */ for (size_t i = 0; i < relocs->num_relocs; i++) { - if (relocs->reloc_bos[i] == &pool->block_pool.bo) { + if (relocs->reloc_bos[i] == pool->block_pool.bo) { /* Adjust the delta value in the relocation to correctly * correspond to the new delta. Initially, this value may have * been negative (if treated as unsigned), but we trust in @@ -1259,8 +1347,10 @@ anv_reloc_list_apply(struct anv_device *device, struct anv_bo *bo, bool always_relocate) { + bo = anv_bo_unwrap(bo); + for (size_t i = 0; i < list->num_relocs; i++) { - struct anv_bo *target_bo = list->reloc_bos[i]; + struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]); if (list->relocs[i].presumed_offset == target_bo->offset && !always_relocate) continue; @@ -1329,6 +1419,7 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. */ for (uint32_t i = 0; i < exec->bo_count; i++) { + assert(!exec->bos[i]->is_wrapper); if (exec->bos[i]->offset == (uint64_t)-1) return false; } @@ -1338,8 +1429,10 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, * what address is actually written in the surface state object at any * given time. 
The only option is to always relocate them. */ + struct anv_bo *surface_state_bo = + anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo); anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, - &cmd_buffer->device->surface_state_pool.block_pool.bo, + surface_state_bo, true /* always relocate surface states */); /* Since we own all of the batch buffers, we know what values are stored @@ -1349,7 +1442,7 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo **bbo; u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { anv_reloc_list_apply(cmd_buffer->device, - &(*bbo)->relocs, &(*bbo)->bo, false); + &(*bbo)->relocs, (*bbo)->bo, false); } for (uint32_t i = 0; i < exec->bo_count; i++) @@ -1368,22 +1461,75 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, cmd_buffer->last_ss_pool_center); - VkResult result = anv_execbuf_add_bo(execbuf, &ss_pool->block_pool.bo, - &cmd_buffer->surface_relocs, 0, - &cmd_buffer->device->alloc); - if (result != VK_SUCCESS) - return result; + VkResult result; + if (cmd_buffer->device->physical->use_softpin) { + anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + /* Add surface dependencies (BOs) to the execbuf */ + anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, + cmd_buffer->surface_relocs.dep_words, + cmd_buffer->surface_relocs.deps, 0); + + /* Add the BOs for all memory objects */ + list_for_each_entry(struct anv_device_memory, mem, + &cmd_buffer->device->memory_objects, link) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + mem->bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + struct anv_block_pool *pool; + pool = &cmd_buffer->device->dynamic_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &cmd_buffer->device->instruction_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &cmd_buffer->device->binding_table_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + } else { + /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs + * will get added automatically by processing relocations on the batch + * buffer. We have to add the surface state BO manually because it has + * relocations of its own that we need to be sure are processsed. + */ + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + ss_pool->block_pool.bo, + &cmd_buffer->surface_relocs, 0); + if (result != VK_SUCCESS) + return result; + } /* First, we walk over all of the bos we've seen and add them and their * relocations to the validate list. 
*/ struct anv_batch_bo **bbo; u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { - adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs, + adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs, cmd_buffer->last_ss_pool_center); - result = anv_execbuf_add_bo(execbuf, &(*bbo)->bo, &(*bbo)->relocs, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + (*bbo)->bo, &(*bbo)->relocs, 0); if (result != VK_SUCCESS) return result; } @@ -1402,24 +1548,24 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, * corresponding to the first batch_bo in the chain with the last * element in the list. */ - if (first_batch_bo->bo.index != execbuf->bo_count - 1) { - uint32_t idx = first_batch_bo->bo.index; + if (first_batch_bo->bo->index != execbuf->bo_count - 1) { + uint32_t idx = first_batch_bo->bo->index; uint32_t last_idx = execbuf->bo_count - 1; struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; - assert(execbuf->bos[idx] == &first_batch_bo->bo); + assert(execbuf->bos[idx] == first_batch_bo->bo); execbuf->objects[idx] = execbuf->objects[last_idx]; execbuf->bos[idx] = execbuf->bos[last_idx]; execbuf->bos[idx]->index = idx; execbuf->objects[last_idx] = tmp_obj; - execbuf->bos[last_idx] = &first_batch_bo->bo; - first_batch_bo->bo.index = last_idx; + execbuf->bos[last_idx] = first_batch_bo->bo; + first_batch_bo->bo->index = last_idx; } /* If we are pinning our BOs, we shouldn't have to relocate anything */ - if (cmd_buffer->device->instance->physicalDevice.use_softpin) + if (cmd_buffer->device->physical->use_softpin) assert(!execbuf->has_relocs); /* Now we go through and fixup all of the relocation lists to point to @@ -1437,7 +1583,7 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, __builtin_ia32_mfence(); u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE) - __builtin_ia32_clflush((*bbo)->bo.map + i); + __builtin_ia32_clflush((*bbo)->bo->map + i); } } @@ -1501,8 +1647,9 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, static VkResult setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device) { - VkResult result = anv_execbuf_add_bo(execbuf, &device->trivial_batch_bo, - NULL, 0, &device->alloc); + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->trivial_batch_bo, + NULL, 0); if (result != VK_SUCCESS) return result; @@ -1511,7 +1658,7 @@ setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device) .buffer_count = execbuf->bo_count, .batch_start_offset = 0, .batch_len = 8, /* GEN7_MI_BATCH_BUFFER_END and NOOP */ - .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER | I915_EXEC_NO_RELOC, .rsvd1 = device->context_id, .rsvd2 = 0, }; @@ -1519,225 +1666,206 @@ setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device) return VK_SUCCESS; } +/* We lock around execbuf for three main reasons: + * + * 1) When a block pool is resized, we create a new gem handle with a + * different size and, in the case of surface states, possibly a different + * center offset but we re-use the same anv_bo struct when we do so. If + * this happens in the middle of setting up an execbuf, we could end up + * with our list of BOs out of sync with our list of gem handles. + * + * 2) The algorithm we use for building the list of unique buffers isn't + * thread-safe. 
While the client is supposed to syncronize around + * QueueSubmit, this would be extremely difficult to debug if it ever came + * up in the wild due to a broken app. It's better to play it safe and + * just lock around QueueSubmit. + * + * 3) The anv_cmd_buffer_execbuf function may perform relocations in + * userspace. Due to the fact that the surface state buffer is shared + * between batches, we can't afford to have that happen from multiple + * threads at the same time. Even though the user is supposed to ensure + * this doesn't happen, we play it safe as in (2) above. + * + * Since the only other things that ever take the device lock such as block + * pool resize only rarely happen, this will almost never be contended so + * taking a lock isn't really an expensive operation in this case. + */ VkResult -anv_cmd_buffer_execbuf(struct anv_device *device, - struct anv_cmd_buffer *cmd_buffer, - const VkSemaphore *in_semaphores, - uint32_t num_in_semaphores, - const VkSemaphore *out_semaphores, - uint32_t num_out_semaphores, - VkFence _fence) +anv_queue_execbuf_locked(struct anv_queue *queue, + struct anv_queue_submit *submit) { - ANV_FROM_HANDLE(anv_fence, fence, _fence); - + struct anv_device *device = queue->device; struct anv_execbuf execbuf; anv_execbuf_init(&execbuf); + execbuf.alloc = submit->alloc; + execbuf.alloc_scope = submit->alloc_scope; + execbuf.perf_query_pass = submit->perf_query_pass; - int in_fence = -1; - VkResult result = VK_SUCCESS; - for (uint32_t i = 0; i < num_in_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); - struct anv_semaphore_impl *impl = - semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? - &semaphore->temporary : &semaphore->permanent; - - switch (impl->type) { - case ANV_SEMAPHORE_TYPE_BO: - result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL, - 0, &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - case ANV_SEMAPHORE_TYPE_SYNC_FILE: - if (in_fence == -1) { - in_fence = impl->fd; - } else { - int merge = anv_gem_sync_file_merge(device, in_fence, impl->fd); - if (merge == -1) - return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); - - close(impl->fd); - close(in_fence); - in_fence = merge; - } - - impl->fd = -1; - break; + /* Always add the workaround BO as it includes a driver identifier for the + * error_state. + */ + VkResult result = + anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; - case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: - result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj, - I915_EXEC_FENCE_WAIT, - &device->alloc); - if (result != VK_SUCCESS) - return result; - break; + for (uint32_t i = 0; i < submit->fence_bo_count; i++) { + int signaled; + struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled); - default: - break; - } + result = anv_execbuf_add_bo(device, &execbuf, bo, NULL, + signaled ? EXEC_OBJECT_WRITE : 0); + if (result != VK_SUCCESS) + goto error; } - bool need_out_fence = false; - for (uint32_t i = 0; i < num_out_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]); - - /* Under most circumstances, out fences won't be temporary. However, - * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: - * - * "If the import is temporary, the implementation must restore the - * semaphore to its prior permanent state after submitting the next - * semaphore wait operation." 
- * - * The spec says nothing whatsoever about signal operations on - * temporarily imported semaphores so it appears they are allowed. - * There are also CTS tests that require this to work. - */ - struct anv_semaphore_impl *impl = - semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? - &semaphore->temporary : &semaphore->permanent; - - switch (impl->type) { - case ANV_SEMAPHORE_TYPE_BO: - result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL, - EXEC_OBJECT_WRITE, &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - case ANV_SEMAPHORE_TYPE_SYNC_FILE: - need_out_fence = true; - break; - - case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: - result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj, - I915_EXEC_FENCE_SIGNAL, - &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - default: - break; - } + if (submit->cmd_buffer) { + result = setup_execbuf_for_cmd_buffer(&execbuf, submit->cmd_buffer); + } else if (submit->simple_bo) { + result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; + + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = submit->simple_bo_size, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + } else { + result = setup_empty_execbuf(&execbuf, queue->device); } - if (fence) { - /* Under most circumstances, out fences won't be temporary. However, - * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: - * - * "If the import is temporary, the implementation must restore the - * semaphore to its prior permanent state after submitting the next - * semaphore wait operation." - * - * The spec says nothing whatsoever about signal operations on - * temporarily imported semaphores so it appears they are allowed. - * There are also CTS tests that require this to work. - */ - struct anv_fence_impl *impl = - fence->temporary.type != ANV_FENCE_TYPE_NONE ? 
- &fence->temporary : &fence->permanent; - - switch (impl->type) { - case ANV_FENCE_TYPE_BO: - result = anv_execbuf_add_bo(&execbuf, &impl->bo.bo, NULL, - EXEC_OBJECT_WRITE, &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - case ANV_FENCE_TYPE_SYNCOBJ: - result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj, - I915_EXEC_FENCE_SIGNAL, - &device->alloc); - if (result != VK_SUCCESS) - return result; - break; + if (result != VK_SUCCESS) + goto error; + + const bool has_perf_query = + submit->perf_query_pass >= 0 && + submit->cmd_buffer && + submit->cmd_buffer->perf_query_pool; + + if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) { + if (submit->cmd_buffer) { + if (has_perf_query) { + struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool; + struct anv_bo *pass_batch_bo = query_pool->bo; + uint64_t pass_batch_offset = + khr_perf_query_preamble_offset(query_pool, + submit->perf_query_pass); + + gen_print_batch(&device->decoder_ctx, + pass_batch_bo->map + pass_batch_offset, 64, + pass_batch_bo->offset + pass_batch_offset, false); + } - default: - unreachable("Invalid fence type"); + struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos); + device->cmd_buffer_being_decoded = submit->cmd_buffer; + gen_print_batch(&device->decoder_ctx, (*bo)->bo->map, + (*bo)->bo->size, (*bo)->bo->offset, false); + device->cmd_buffer_being_decoded = NULL; + } else if (submit->simple_bo) { + gen_print_batch(&device->decoder_ctx, submit->simple_bo->map, + submit->simple_bo->size, submit->simple_bo->offset, false); + } else { + gen_print_batch(&device->decoder_ctx, + device->trivial_batch_bo->map, + device->trivial_batch_bo->size, + device->trivial_batch_bo->offset, false); } } - if (cmd_buffer) - result = setup_execbuf_for_cmd_buffer(&execbuf, cmd_buffer); - else - result = setup_empty_execbuf(&execbuf, device); - - if (result != VK_SUCCESS) - return result; - - if (execbuf.fence_count > 0) { - assert(device->instance->physicalDevice.has_syncobj); - execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; - execbuf.execbuf.num_cliprects = execbuf.fence_count; - execbuf.execbuf.cliprects_ptr = (uintptr_t) execbuf.fences; + if (submit->fence_count > 0) { + assert(device->physical->has_syncobj); + if (device->has_thread_submit) { + execbuf.timeline_fences.fence_count = submit->fence_count; + execbuf.timeline_fences.handles_ptr = (uintptr_t)submit->fences; + execbuf.timeline_fences.values_ptr = (uintptr_t)submit->fence_values; + anv_execbuf_add_ext(&execbuf, + DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES, + &execbuf.timeline_fences.base); + } else { + execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.execbuf.num_cliprects = submit->fence_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences; + } } - if (in_fence != -1) { + if (submit->in_fence != -1) { + assert(!device->has_thread_submit); execbuf.execbuf.flags |= I915_EXEC_FENCE_IN; - execbuf.execbuf.rsvd2 |= (uint32_t)in_fence; + execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence; } - if (need_out_fence) + if (submit->need_out_fence) { + assert(!device->has_thread_submit); execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT; + } - result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos); - - /* Execbuf does not consume the in_fence. It's our job to close it. 
*/ - if (in_fence != -1) - close(in_fence); + if (has_perf_query) { + struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool; + assert(submit->perf_query_pass < query_pool->n_passes); + struct gen_perf_query_info *query_info = + query_pool->pass_query[submit->perf_query_pass]; - for (uint32_t i = 0; i < num_in_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); - /* From the Vulkan 1.0.53 spec: - * - * "If the import is temporary, the implementation must restore the - * semaphore to its prior permanent state after submitting the next - * semaphore wait operation." - * - * This has to happen after the execbuf in case we close any syncobjs in - * the process. + /* Some performance queries just the pipeline statistic HW, no need for + * OA in that case, so no need to reconfigure. */ - anv_semaphore_reset_temporary(device, semaphore); + if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) && + (query_info->kind == GEN_PERF_QUERY_TYPE_OA || + query_info->kind == GEN_PERF_QUERY_TYPE_RAW)) { + int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) query_info->oa_metrics_set_id); + if (ret < 0) { + result = anv_device_set_lost(device, + "i915-perf config failed: %s", + strerror(ret)); + } + } + + struct anv_bo *pass_batch_bo = query_pool->bo; + + struct drm_i915_gem_exec_object2 query_pass_object = { + .handle = pass_batch_bo->gem_handle, + .offset = pass_batch_bo->offset, + .flags = pass_batch_bo->flags, + }; + struct drm_i915_gem_execbuffer2 query_pass_execbuf = { + .buffers_ptr = (uintptr_t) &query_pass_object, + .buffer_count = 1, + .batch_start_offset = khr_perf_query_preamble_offset(query_pool, + submit->perf_query_pass), + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, + .rsvd1 = device->context_id, + }; + + int ret = queue->device->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &query_pass_execbuf); + if (ret) + result = anv_queue_set_lost(queue, "execbuf2 failed: %m"); } - if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) { - /* BO fences can't be shared, so they can't be temporary. */ - assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); + int ret = queue->device->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = anv_queue_set_lost(queue, "execbuf2 failed: %m"); - /* Once the execbuf has returned, we need to set the fence state to - * SUBMITTED. We can't do this before calling execbuf because - * anv_GetFenceStatus does take the global device lock before checking - * fence->state. - * - * We set the fence state to SUBMITTED regardless of whether or not the - * execbuf succeeds because we need to ensure that vkWaitForFences() and - * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or - * VK_SUCCESS) in a finite amount of time even if execbuf fails. - */ - fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED; + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; } - if (result == VK_SUCCESS && need_out_fence) { - int out_fence = execbuf.execbuf.rsvd2 >> 32; - for (uint32_t i = 0; i < num_out_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]); - /* Out fences can't have temporary state because that would imply - * that we imported a sync file and are trying to signal it. 
- */ - assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE); - struct anv_semaphore_impl *impl = &semaphore->permanent; + if (result == VK_SUCCESS && submit->need_out_fence) + submit->out_fence = execbuf.execbuf.rsvd2 >> 32; - if (impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE) { - assert(impl->fd == -1); - impl->fd = dup(out_fence); - } - } - close(out_fence); - } + error: + pthread_cond_broadcast(&device->queue_submit); - anv_execbuf_finish(&execbuf, &device->alloc); + anv_execbuf_finish(&execbuf); return result; }