From: Bas Nieuwenhuizen Date: Sun, 29 Jan 2017 12:53:05 +0000 (+0100) Subject: radv: Handle command buffers that need scratch memory. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c4d7b9cd290bdedb0e58fa52bf32d39d2411a789;p=mesa.git radv: Handle command buffers that need scratch memory. v2: Create the descriptor BO with CPU access. Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Dave Airlie --- diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index da65511cf15..60714217f01 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -32,6 +32,7 @@ #include #include #include "radv_private.h" +#include "radv_cs.h" #include "util/strtod.h" #include @@ -752,6 +753,15 @@ radv_queue_finish(struct radv_queue *queue) { if (queue->hw_ctx) queue->device->ws->ctx_destroy(queue->hw_ctx); + + if (queue->preamble_cs) + queue->device->ws->cs_destroy(queue->preamble_cs); + if (queue->descriptor_bo) + queue->device->ws->buffer_destroy(queue->descriptor_bo); + if (queue->scratch_bo) + queue->device->ws->buffer_destroy(queue->scratch_bo); + if (queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(queue->compute_scratch_bo); } VkResult radv_CreateDevice( @@ -1000,6 +1010,159 @@ static void radv_dump_trace(struct radv_device *device, fclose(f); } +static VkResult +radv_get_preamble_cs(struct radv_queue *queue, + uint32_t scratch_size, + uint32_t compute_scratch_size, + struct radeon_winsys_cs **preamble_cs) +{ + struct radeon_winsys_bo *scratch_bo = NULL; + struct radeon_winsys_bo *descriptor_bo = NULL; + struct radeon_winsys_bo *compute_scratch_bo = NULL; + struct radeon_winsys_cs *cs = NULL; + + if (!scratch_size && !compute_scratch_size) { + *preamble_cs = NULL; + return VK_SUCCESS; + } + + if (scratch_size <= queue->scratch_size && + compute_scratch_size <= queue->compute_scratch_size) { + *preamble_cs = queue->preamble_cs; + return VK_SUCCESS; + } + + if (scratch_size > queue->scratch_size) { + scratch_bo = queue->device->ws->buffer_create(queue->device->ws, + scratch_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!scratch_bo) + goto fail; + } else + scratch_bo = queue->scratch_bo; + + if (compute_scratch_size > queue->compute_scratch_size) { + compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws, + compute_scratch_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!compute_scratch_bo) + goto fail; + + } else + compute_scratch_bo = queue->compute_scratch_bo; + + if (scratch_bo != queue->scratch_bo) { + descriptor_bo = queue->device->ws->buffer_create(queue->device->ws, + 8, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_CPU_ACCESS); + if (!descriptor_bo) + goto fail; + } else + descriptor_bo = queue->descriptor_bo; + + cs = queue->device->ws->cs_create(queue->device->ws, + queue->queue_family_index ? RING_COMPUTE : RING_GFX); + if (!cs) + goto fail; + + + if (scratch_bo) + queue->device->ws->cs_add_buffer(cs, scratch_bo, 8); + + if (descriptor_bo) + queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8); + + if (descriptor_bo != queue->descriptor_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + + uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo); + + map[0] = scratch_va; + map[1] = rsrc1; + + queue->device->ws->buffer_unmap(descriptor_bo); + } + + if (descriptor_bo) { + uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0, + R_00B130_SPI_SHADER_USER_DATA_VS_0, + R_00B230_SPI_SHADER_USER_DATA_GS_0, + R_00B330_SPI_SHADER_USER_DATA_ES_0, + R_00B430_SPI_SHADER_USER_DATA_HS_0, + R_00B530_SPI_SHADER_USER_DATA_LS_0}; + + uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo); + + for (int i = 0; i < ARRAY_SIZE(regs); ++i) { + radeon_set_sh_reg_seq(cs, regs[i], 2); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } + } + + if (compute_scratch_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + + queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8); + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); + radeon_emit(cs, scratch_va); + radeon_emit(cs, rsrc1); + } + + if (!queue->device->ws->cs_finalize(cs)) + goto fail; + + if (queue->preamble_cs) + queue->device->ws->cs_destroy(queue->preamble_cs); + + queue->preamble_cs = cs; + + if (scratch_bo != queue->scratch_bo) { + if (queue->scratch_bo) + queue->device->ws->buffer_destroy(queue->scratch_bo); + queue->scratch_bo = scratch_bo; + queue->scratch_size = scratch_size; + } + + if (compute_scratch_bo != queue->compute_scratch_bo) { + if (queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(queue->compute_scratch_bo); + queue->compute_scratch_bo = compute_scratch_bo; + queue->compute_scratch_size = compute_scratch_size; + } + + if (descriptor_bo != queue->descriptor_bo) { + if (queue->descriptor_bo) + queue->device->ws->buffer_destroy(queue->descriptor_bo); + + queue->descriptor_bo = descriptor_bo; + } + + *preamble_cs = cs; + return VK_SUCCESS; +fail: + if (cs) + queue->device->ws->cs_destroy(cs); + if (descriptor_bo && descriptor_bo != queue->descriptor_bo) + queue->device->ws->buffer_destroy(descriptor_bo); + if (scratch_bo && scratch_bo != queue->scratch_bo) + queue->device->ws->buffer_destroy(scratch_bo); + if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(compute_scratch_bo); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; +} + VkResult radv_QueueSubmit( VkQueue _queue, uint32_t submitCount, @@ -1012,6 +1175,27 @@ VkResult radv_QueueSubmit( struct radeon_winsys_ctx *ctx = queue->hw_ctx; int ret; uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; + uint32_t scratch_size = 0; + uint32_t compute_scratch_size = 0; + struct radeon_winsys_cs *preamble_cs = NULL; + VkResult result; + + /* Do this first so failing to allocate scratch buffers can't result in + * partially executed submissions. */ + for (uint32_t i = 0; i < submitCount; i++) { + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + pSubmits[i].pCommandBuffers[j]); + + scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed); + compute_scratch_size = MAX2(compute_scratch_size, + cmd_buffer->compute_scratch_size_needed); + } + } + + result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, &preamble_cs); + if (result != VK_SUCCESS) + return result; for (uint32_t i = 0; i < submitCount; i++) { struct radeon_winsys_cs **cs_array; @@ -1044,7 +1228,7 @@ VkResult radv_QueueSubmit( *queue->device->trace_id_ptr = 0; ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, - advance, NULL, + advance, preamble_cs, (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores, b ? pSubmits[i].waitSemaphoreCount : 0, (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores, diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index e332877e2ba..f46987fdf6c 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -363,12 +363,13 @@ static void radv_fill_shader_variant(struct radv_device *device, struct ac_shader_binary *binary, gl_shader_stage stage) { - variant->code_size = binary->code_size; bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0; unsigned vgpr_comp_cnt = 0; - if (scratch_enabled) - radv_finishme("shader scratch space"); + if (scratch_enabled && !device->llvm_supports_spill) + radv_finishme("shader scratch support only available with LLVM 4.0"); + + variant->code_size = binary->code_size; switch (stage) { case MESA_SHADER_VERTEX: @@ -433,8 +434,8 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH); options.family = chip_family; options.chip_class = device->physical_device->rad_info.chip_class; - options.supports_spill = false; - tm = ac_create_target_machine(chip_family, false); + options.supports_spill = device->llvm_supports_spill; + tm = ac_create_target_machine(chip_family, options.supports_spill); ac_compile_nir_shader(tm, &binary, &variant->config, &variant->info, shader, &options, dump); LLVMDisposeTargetMachine(tm); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 88e05595380..fac5b97153d 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -467,6 +467,14 @@ struct radv_queue { struct radeon_winsys_ctx *hw_ctx; int queue_family_index; int queue_idx; + + uint32_t scratch_size; + uint32_t compute_scratch_size; + + struct radeon_winsys_bo *scratch_bo; + struct radeon_winsys_bo *descriptor_bo; + struct radeon_winsys_bo *compute_scratch_bo; + struct radeon_winsys_cs *preamble_cs; }; struct radv_device {