X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_device.c;h=9bc44b84a2ab82f88d8602271da98df2717f9a0f;hb=06ffd299252311f57feac4474551bd5b44d3d4d4;hp=4aa6af2bbdacd6e346afecb68a9a96b34a8f3c22;hpb=f65b3641c3233f1697b96ea8126b578dae6de4f1;p=mesa.git

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 4aa6af2bbda..9bc44b84a2a 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include "radv_private.h"
+#include "radv_cs.h"
 #include "util/strtod.h"
 
 #include 
@@ -103,6 +104,10 @@ static const VkExtensionProperties instance_extensions[] = {
 };
 
 static const VkExtensionProperties common_device_extensions[] = {
+    {
+        .extensionName = VK_KHR_MAINTENANCE1_EXTENSION_NAME,
+        .specVersion = 1,
+    },
     {
         .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME,
         .specVersion = 1,
@@ -116,7 +121,11 @@ static const VkExtensionProperties common_device_extensions[] = {
         .specVersion = 1,
     },
     {
-        .extensionName = VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_EXTENSION_NAME,
+        .extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
+        .specVersion = 1,
+    },
+    {
+        .extensionName = VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME,
         .specVersion = 1,
     },
 };
@@ -285,7 +294,7 @@ static const VkAllocationCallbacks default_alloc = {
 };
 
 static const struct debug_control radv_debug_options[] = {
-    {"fastclears", RADV_DEBUG_FAST_CLEARS},
+    {"nofastclears", RADV_DEBUG_NO_FAST_CLEARS},
     {"nodcc", RADV_DEBUG_NO_DCC},
     {"shaders", RADV_DEBUG_DUMP_SHADERS},
     {"nocache", RADV_DEBUG_NO_CACHE},
@@ -423,7 +432,7 @@ void radv_GetPhysicalDeviceFeatures(
         .fullDrawIndexUint32 = true,
         .imageCubeArray = true,
         .independentBlend = true,
-        .geometryShader = false,
+        .geometryShader = true,
         .tessellationShader = false,
         .sampleRateShading = false,
         .dualSrcBlend = true,
@@ -437,7 +446,7 @@ void radv_GetPhysicalDeviceFeatures(
         .wideLines = true,
         .largePoints = true,
         .alphaToOne = true,
-        .multiViewport = false,
+        .multiViewport = true,
         .samplerAnisotropy = true,
         .textureCompressionETC2 = false,
         .textureCompressionASTC_LDR = false,
@@ -458,7 +467,7 @@ void radv_GetPhysicalDeviceFeatures(
         .shaderStorageImageWriteWithoutFormat = false,
         .shaderClipDistance = true,
         .shaderCullDistance = true,
-        .shaderFloat64 = false,
+        .shaderFloat64 = true,
         .shaderInt64 = false,
         .shaderInt16 = false,
         .alphaToOne = true,
@@ -467,6 +476,13 @@ void radv_GetPhysicalDeviceFeatures(
     };
 }
 
+void radv_GetPhysicalDeviceFeatures2KHR(
+    VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceFeatures2KHR *pFeatures)
+{
+    return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
+}
+
 void radv_GetPhysicalDeviceProperties(
     VkPhysicalDevice physicalDevice,
     VkPhysicalDeviceProperties* pProperties)
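Note: the new radv_GetPhysicalDeviceFeatures2KHR wrapper (and the Properties/QueueFamily/MemoryProperties counterparts below) only unwraps the core struct; no pNext extension structs are walked yet. A caller-side sketch, assuming VK_KHR_get_physical_device_properties2 was enabled at instance creation and the entry point was loaded (hypothetical application code, not part of this patch):

	VkPhysicalDeviceFeatures2KHR features2 = {
		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR,
		.pNext = NULL,
	};
	vkGetPhysicalDeviceFeatures2KHR(physical_device, &features2);
	if (features2.features.geometryShader)
		; /* geometry shaders are now advertised by radv */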
@@ -600,12 +616,18 @@ void radv_GetPhysicalDeviceProperties(
     memcpy(pProperties->pipelineCacheUUID, pdevice->uuid, VK_UUID_SIZE);
 }
 
-void radv_GetPhysicalDeviceQueueFamilyProperties(
+void radv_GetPhysicalDeviceProperties2KHR(
     VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceProperties2KHR *pProperties)
+{
+    return radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
+}
+
+static void radv_get_physical_device_queue_family_properties(
+    struct radv_physical_device* pdevice,
     uint32_t* pCount,
-    VkQueueFamilyProperties* pQueueFamilyProperties)
+    VkQueueFamilyProperties** pQueueFamilyProperties)
 {
-    RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
     int num_queue_families = 1;
     int idx;
     if (pdevice->rad_info.compute_rings > 0 &&
@@ -623,7 +645,7 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
 
     idx = 0;
     if (*pCount >= 1) {
-        pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
+        *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
             .queueFlags = VK_QUEUE_GRAPHICS_BIT |
                           VK_QUEUE_COMPUTE_BIT |
                           VK_QUEUE_TRANSFER_BIT,
@@ -638,7 +660,7 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
         pdevice->rad_info.chip_class >= CIK &&
         !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
         if (*pCount > idx) {
-            pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
+            *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
                 .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT,
                 .queueCount = pdevice->rad_info.compute_rings,
                 .timestampValidBits = 64,
@@ -650,9 +672,47 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
     *pCount = idx;
 }
 
+void radv_GetPhysicalDeviceQueueFamilyProperties(
+    VkPhysicalDevice physicalDevice,
+    uint32_t* pCount,
+    VkQueueFamilyProperties* pQueueFamilyProperties)
+{
+    RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+    if (!pQueueFamilyProperties) {
+        radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL);
+        return;
+    }
+    VkQueueFamilyProperties *properties[] = {
+        pQueueFamilyProperties + 0,
+        pQueueFamilyProperties + 1,
+        pQueueFamilyProperties + 2,
+    };
+    radv_get_physical_device_queue_family_properties(pdevice, pCount, properties);
+    assert(*pCount <= 3);
+}
+
+void radv_GetPhysicalDeviceQueueFamilyProperties2KHR(
+    VkPhysicalDevice physicalDevice,
+    uint32_t* pCount,
+    VkQueueFamilyProperties2KHR *pQueueFamilyProperties)
+{
+    RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+    if (!pQueueFamilyProperties) {
+        radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL);
+        return;
+    }
+    VkQueueFamilyProperties *properties[] = {
+        &pQueueFamilyProperties[0].queueFamilyProperties,
+        &pQueueFamilyProperties[1].queueFamilyProperties,
+        &pQueueFamilyProperties[2].queueFamilyProperties,
+    };
+    radv_get_physical_device_queue_family_properties(pdevice, pCount, properties);
+    assert(*pCount <= 3);
+}
+
 void radv_GetPhysicalDeviceMemoryProperties(
     VkPhysicalDevice physicalDevice,
-    VkPhysicalDeviceMemoryProperties* pMemoryProperties)
+    VkPhysicalDeviceMemoryProperties *pMemoryProperties)
 {
     RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
 
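Note: both public entry points now funnel through one static helper that takes an array of per-family output pointers, so the 1.0 and 2KHR paths share the pQueueFamilyProperties == NULL count-query logic. Standard two-call usage from an application (illustrative only):

	uint32_t count = 0;
	vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &count, NULL);  /* first call: query count */
	VkQueueFamilyProperties *props = malloc(count * sizeof(*props));
	vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &count, props); /* second call: fill array */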
@@ -699,6 +759,14 @@ void radv_GetPhysicalDeviceMemoryProperties(
     };
 }
 
+void radv_GetPhysicalDeviceMemoryProperties2KHR(
+    VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceMemoryProperties2KHR *pMemoryProperties)
+{
+    return radv_GetPhysicalDeviceMemoryProperties(physicalDevice,
+                                                  &pMemoryProperties->memoryProperties);
+}
+
 static int
 radv_queue_init(struct radv_device *device, struct radv_queue *queue,
                 int queue_family_index, int idx)
@@ -720,6 +788,49 @@ radv_queue_finish(struct radv_queue *queue)
 {
     if (queue->hw_ctx)
         queue->device->ws->ctx_destroy(queue->hw_ctx);
+
+    if (queue->preamble_cs)
+        queue->device->ws->cs_destroy(queue->preamble_cs);
+    if (queue->descriptor_bo)
+        queue->device->ws->buffer_destroy(queue->descriptor_bo);
+    if (queue->scratch_bo)
+        queue->device->ws->buffer_destroy(queue->scratch_bo);
+    if (queue->esgs_ring_bo)
+        queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
+    if (queue->gsvs_ring_bo)
+        queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
+    if (queue->compute_scratch_bo)
+        queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
+}
+
+static void
+radv_device_init_gs_info(struct radv_device *device)
+{
+    switch (device->physical_device->rad_info.family) {
+    case CHIP_OLAND:
+    case CHIP_HAINAN:
+    case CHIP_KAVERI:
+    case CHIP_KABINI:
+    case CHIP_MULLINS:
+    case CHIP_ICELAND:
+    case CHIP_CARRIZO:
+    case CHIP_STONEY:
+        device->gs_table_depth = 16;
+        return;
+    case CHIP_TAHITI:
+    case CHIP_PITCAIRN:
+    case CHIP_VERDE:
+    case CHIP_BONAIRE:
+    case CHIP_HAWAII:
+    case CHIP_TONGA:
+    case CHIP_FIJI:
+    case CHIP_POLARIS10:
+    case CHIP_POLARIS11:
+        device->gs_table_depth = 32;
+        return;
+    default:
+        unreachable("unknown GPU");
+    }
 }
 
 VkResult radv_CreateDevice(
@@ -781,6 +892,30 @@ VkResult radv_CreateDevice(
         }
     }
 
+#if HAVE_LLVM < 0x0400
+    device->llvm_supports_spill = false;
+#else
+    device->llvm_supports_spill = true;
+#endif
+
+    /* The maximum number of scratch waves. Scratch space isn't divided
+     * evenly between CUs. The number is only a function of the number of CUs.
+     * We can decrease the constant to decrease the scratch buffer size.
+     *
+     * sctx->scratch_waves must be >= the maximum possible size of
+     * 1 threadgroup, so that the hw doesn't hang from being unable
+     * to start any.
+     *
+     * The recommended value is 4 per CU at most. Higher numbers don't
+     * bring much benefit, but they still occupy chip resources (think
+     * async compute). I've seen ~2% performance difference between 4 and 32.
+     */
+    uint32_t max_threads_per_block = 2048;
+    device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
+                                 max_threads_per_block / 64);
+
+    radv_device_init_gs_info(device);
+
     result = radv_device_init_meta(device);
     if (result != VK_SUCCESS)
         goto fail;
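Note: a worked instance of the formula above, for a hypothetical 36-CU part: scratch_waves = MAX2(32 * 36, 2048 / 64) = MAX2(1152, 32) = 1152 waves. The max_threads_per_block / 64 term (32 waves, one wave per 64 threads) is only a floor for very small CU counts; on any realistic GPU the 32-per-CU term dominates.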
@@ -814,6 +949,10 @@ VkResult radv_CreateDevice(
         goto fail;
     }
 
+    /* temporarily disabled on CIK */
+    if (device->physical_device->rad_info.chip_class > CIK)
+        cik_create_gfx_config(device);
+
     *pDevice = radv_device_to_handle(device);
     return VK_SUCCESS;
 
@@ -821,6 +960,9 @@ fail:
     if (device->trace_bo)
         device->ws->buffer_destroy(device->trace_bo);
 
+    if (device->gfx_init)
+        device->ws->buffer_destroy(device->gfx_init);
+
     for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
         for (unsigned q = 0; q < device->queue_count[i]; q++)
             radv_queue_finish(&device->queues[i][q]);
@@ -841,6 +983,9 @@ void radv_DestroyDevice(
     if (device->trace_bo)
         device->ws->buffer_destroy(device->trace_bo);
 
+    if (device->gfx_init)
+        device->ws->buffer_destroy(device->gfx_init);
+
     for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
         for (unsigned q = 0; q < device->queue_count[i]; q++)
             radv_queue_finish(&device->queues[i][q]);
@@ -946,6 +1091,335 @@ static void radv_dump_trace(struct radv_device *device,
     fclose(f);
 }
 
+static void
+fill_geom_rings(struct radv_queue *queue,
+                uint32_t *map,
+                uint32_t esgs_ring_size,
+                struct radeon_winsys_bo *esgs_ring_bo,
+                uint32_t gsvs_ring_size,
+                struct radeon_winsys_bo *gsvs_ring_bo)
+{
+    uint64_t esgs_va = 0, gsvs_va = 0;
+    uint32_t *desc = &map[4];
+
+    if (esgs_ring_bo)
+        esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
+    if (gsvs_ring_bo)
+        gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);
+
+    /* stride 0, num records - size, add tid, swizzle, elsize4,
+       index stride 64 */
+    desc[0] = esgs_va;
+    desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
+              S_008F04_STRIDE(0) |
+              S_008F04_SWIZZLE_ENABLE(true);
+    desc[2] = esgs_ring_size;
+    desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+              S_008F0C_ELEMENT_SIZE(1) |
+              S_008F0C_INDEX_STRIDE(3) |
+              S_008F0C_ADD_TID_ENABLE(true);
+
+    desc += 4;
+    /* GS entry for ES->GS ring */
+    /* stride 0, num records - size, elsize0,
+       index stride 0 */
+    desc[0] = esgs_va;
+    desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
+              S_008F04_STRIDE(0) |
+              S_008F04_SWIZZLE_ENABLE(false);
+    desc[2] = esgs_ring_size;
+    desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+              S_008F0C_ELEMENT_SIZE(0) |
+              S_008F0C_INDEX_STRIDE(0) |
+              S_008F0C_ADD_TID_ENABLE(false);
+
+    desc += 4;
+    /* VS entry for GS->VS ring */
+    /* stride 0, num records - size, elsize0,
+       index stride 0 */
+    desc[0] = gsvs_va;
+    desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32) |
+              S_008F04_STRIDE(0) |
+              S_008F04_SWIZZLE_ENABLE(false);
+    desc[2] = gsvs_ring_size;
+    desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+              S_008F0C_ELEMENT_SIZE(0) |
+              S_008F0C_INDEX_STRIDE(0) |
+              S_008F0C_ADD_TID_ENABLE(false);
+    desc += 4;
+
+    /* stride gsvs_itemsize, num records 64
+       elsize 4, index stride 16 */
+    /* shader will patch stride and desc[2] */
+    desc[0] = gsvs_va;
+    desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32) |
+              S_008F04_STRIDE(0) |
+              S_008F04_SWIZZLE_ENABLE(true);
+    desc[2] = 0;
+    desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+              S_008F0C_ELEMENT_SIZE(1) |
+              S_008F0C_INDEX_STRIDE(1) |
+              S_008F0C_ADD_TID_ENABLE(true);
+}
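Note: each ring entry above is one 4-dword SI-style buffer resource descriptor. fill_geom_rings() writes four of them starting at map[4]; map[0..1] hold the graphics scratch rsrc words and map[2..3] are padding, so the full table is 2 + 2 + 4 * 4 = 20 dwords = 80 bytes, which is where the size = 80 below comes from. A minimal sketch of the dword packing, assuming the usual SQ_BUF_RSRC_WORD* field layout behind the S_008F04_*/S_008F0C_* macros (the bit positions here are assumptions for illustration, not taken from this patch):

	#include <stdbool.h>
	#include <stdint.h>

	static void pack_ring_desc(uint32_t desc[4], uint64_t va,
	                           uint32_t num_records, bool swizzle)
	{
		desc[0] = (uint32_t)va;                   /* WORD0: base address, low 32 bits      */
		desc[1] = (uint32_t)(va >> 32) & 0xffff;  /* WORD1: BASE_ADDRESS_HI, STRIDE = 0    */
		if (swizzle)
			desc[1] |= 1u << 31;              /* WORD1: SWIZZLE_ENABLE (assumed bit)   */
		desc[2] = num_records;                    /* WORD2: NUM_RECORDS = ring size        */
		desc[3] = 0;                              /* WORD3: DST_SEL/FORMAT/ELEMENT_SIZE/
		                                             INDEX_STRIDE/ADD_TID fields, omitted */
	}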
+
+static VkResult
+radv_get_preamble_cs(struct radv_queue *queue,
+                     uint32_t scratch_size,
+                     uint32_t compute_scratch_size,
+                     uint32_t esgs_ring_size,
+                     uint32_t gsvs_ring_size,
+                     struct radeon_winsys_cs **preamble_cs)
+{
+    struct radeon_winsys_bo *scratch_bo = NULL;
+    struct radeon_winsys_bo *descriptor_bo = NULL;
+    struct radeon_winsys_bo *compute_scratch_bo = NULL;
+    struct radeon_winsys_bo *esgs_ring_bo = NULL;
+    struct radeon_winsys_bo *gsvs_ring_bo = NULL;
+    struct radeon_winsys_cs *cs = NULL;
+
+    if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) {
+        *preamble_cs = NULL;
+        return VK_SUCCESS;
+    }
+
+    if (scratch_size <= queue->scratch_size &&
+        compute_scratch_size <= queue->compute_scratch_size &&
+        esgs_ring_size <= queue->esgs_ring_size &&
+        gsvs_ring_size <= queue->gsvs_ring_size) {
+        *preamble_cs = queue->preamble_cs;
+        return VK_SUCCESS;
+    }
+
+    if (scratch_size > queue->scratch_size) {
+        scratch_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                      scratch_size,
+                                                      4096,
+                                                      RADEON_DOMAIN_VRAM,
+                                                      RADEON_FLAG_NO_CPU_ACCESS);
+        if (!scratch_bo)
+            goto fail;
+    } else
+        scratch_bo = queue->scratch_bo;
+
+    if (compute_scratch_size > queue->compute_scratch_size) {
+        compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                              compute_scratch_size,
+                                                              4096,
+                                                              RADEON_DOMAIN_VRAM,
+                                                              RADEON_FLAG_NO_CPU_ACCESS);
+        if (!compute_scratch_bo)
+            goto fail;
+
+    } else
+        compute_scratch_bo = queue->compute_scratch_bo;
+
+    if (esgs_ring_size > queue->esgs_ring_size) {
+        esgs_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                        esgs_ring_size,
+                                                        4096,
+                                                        RADEON_DOMAIN_VRAM,
+                                                        RADEON_FLAG_NO_CPU_ACCESS);
+        if (!esgs_ring_bo)
+            goto fail;
+    } else {
+        esgs_ring_bo = queue->esgs_ring_bo;
+        esgs_ring_size = queue->esgs_ring_size;
+    }
+
+    if (gsvs_ring_size > queue->gsvs_ring_size) {
+        gsvs_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                        gsvs_ring_size,
+                                                        4096,
+                                                        RADEON_DOMAIN_VRAM,
+                                                        RADEON_FLAG_NO_CPU_ACCESS);
+        if (!gsvs_ring_bo)
+            goto fail;
+    } else {
+        gsvs_ring_bo = queue->gsvs_ring_bo;
+        gsvs_ring_size = queue->gsvs_ring_size;
+    }
+
+    if (scratch_bo != queue->scratch_bo ||
+        esgs_ring_bo != queue->esgs_ring_bo ||
+        gsvs_ring_bo != queue->gsvs_ring_bo) {
+        uint32_t size = 0;
+        if (gsvs_ring_bo || esgs_ring_bo)
+            size = 80; /* 2 dword + 2 padding + 4 dword * 4 */
+        else if (scratch_bo)
+            size = 8; /* 2 dword */
+
+        descriptor_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                         size,
+                                                         4096,
+                                                         RADEON_DOMAIN_VRAM,
+                                                         RADEON_FLAG_CPU_ACCESS);
+        if (!descriptor_bo)
+            goto fail;
+    } else
+        descriptor_bo = queue->descriptor_bo;
+
+    cs = queue->device->ws->cs_create(queue->device->ws,
+                                      queue->queue_family_index ? RING_COMPUTE : RING_GFX);
+    if (!cs)
+        goto fail;
+
+
+    if (scratch_bo)
+        queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
+
+    if (esgs_ring_bo)
+        queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
+
+    if (gsvs_ring_bo)
+        queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
+
+    if (descriptor_bo)
+        queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
+
+    if (descriptor_bo != queue->descriptor_bo) {
+        uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
+
+        if (scratch_bo) {
+            uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
+            uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                             S_008F04_SWIZZLE_ENABLE(1);
+            map[0] = scratch_va;
+            map[1] = rsrc1;
+        }
+
+        if (esgs_ring_bo || gsvs_ring_bo)
+            fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
+
+        queue->device->ws->buffer_unmap(descriptor_bo);
+    }
+
+    if (esgs_ring_bo || gsvs_ring_bo) {
+        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+        radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+        radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+
+        if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+            radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+            radeon_emit(cs, esgs_ring_size >> 8);
+            radeon_emit(cs, gsvs_ring_size >> 8);
+        } else {
+            radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+            radeon_emit(cs, esgs_ring_size >> 8);
+            radeon_emit(cs, gsvs_ring_size >> 8);
+        }
+    }
+
+    if (descriptor_bo) {
+        uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+                           R_00B130_SPI_SHADER_USER_DATA_VS_0,
+                           R_00B230_SPI_SHADER_USER_DATA_GS_0,
+                           R_00B330_SPI_SHADER_USER_DATA_ES_0,
+                           R_00B430_SPI_SHADER_USER_DATA_HS_0,
+                           R_00B530_SPI_SHADER_USER_DATA_LS_0};
+
+        uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo);
+
+        for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+            radeon_set_sh_reg_seq(cs, regs[i], 2);
+            radeon_emit(cs, va);
+            radeon_emit(cs, va >> 32);
+        }
+    }
+
+    if (compute_scratch_bo) {
+        uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo);
+        uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                         S_008F04_SWIZZLE_ENABLE(1);
+
+        queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8);
+
+        radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+        radeon_emit(cs, scratch_va);
+        radeon_emit(cs, rsrc1);
+    }
+
+    if (!queue->device->ws->cs_finalize(cs))
+        goto fail;
+
+    if (queue->preamble_cs)
+        queue->device->ws->cs_destroy(queue->preamble_cs);
+
+    queue->preamble_cs = cs;
+
+    if (scratch_bo != queue->scratch_bo) {
+        if (queue->scratch_bo)
+            queue->device->ws->buffer_destroy(queue->scratch_bo);
+        queue->scratch_bo = scratch_bo;
+        queue->scratch_size = scratch_size;
+    }
+
+    if (compute_scratch_bo != queue->compute_scratch_bo) {
+        if (queue->compute_scratch_bo)
+            queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
+        queue->compute_scratch_bo = compute_scratch_bo;
+        queue->compute_scratch_size = compute_scratch_size;
+    }
+
+    if (esgs_ring_bo != queue->esgs_ring_bo) {
+        if (queue->esgs_ring_bo)
+            queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
+        queue->esgs_ring_bo = esgs_ring_bo;
+        queue->esgs_ring_size = esgs_ring_size;
+    }
+
+    if (gsvs_ring_bo != queue->gsvs_ring_bo) {
+        if (queue->gsvs_ring_bo)
+            queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
+        queue->gsvs_ring_bo = gsvs_ring_bo;
+        queue->gsvs_ring_size = gsvs_ring_size;
+    }
+
+    if (descriptor_bo != queue->descriptor_bo) {
+        if (queue->descriptor_bo)
+            queue->device->ws->buffer_destroy(queue->descriptor_bo);
+
+        queue->descriptor_bo = descriptor_bo;
+    }
+
+    *preamble_cs = cs;
+    return VK_SUCCESS;
+fail:
+    if (cs)
+        queue->device->ws->cs_destroy(cs);
+    if (descriptor_bo && descriptor_bo != queue->descriptor_bo)
+        queue->device->ws->buffer_destroy(descriptor_bo);
+    if (scratch_bo && scratch_bo != queue->scratch_bo)
+        queue->device->ws->buffer_destroy(scratch_bo);
+    if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo)
+        queue->device->ws->buffer_destroy(compute_scratch_bo);
+    if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo)
+        queue->device->ws->buffer_destroy(esgs_ring_bo);
+    if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
+        queue->device->ws->buffer_destroy(gsvs_ring_bo);
+    return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+}
+
 VkResult radv_QueueSubmit(
     VkQueue _queue,
     uint32_t submitCount,
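Note: radv_get_preamble_cs() is a grow-only cache: if every requested size fits the queue's current buffers, the existing preamble is reused; otherwise only the outgrown buffers are reallocated, and the old ones are freed only after the new CS is finalized, so on failure the queue's cached state is untouched. The same pattern in isolation, with a toy buffer type standing in for radeon_winsys_bo (illustrative only):

	#include <stdlib.h>

	struct cache { void *bo; size_t size; };

	static int ensure_size(struct cache *c, size_t needed)
	{
		if (needed <= c->size)
			return 0;            /* cached buffer is already big enough */
		void *bo = malloc(needed);   /* stand-in for ws->buffer_create()    */
		if (!bo)
			return -1;           /* old buffer stays valid on failure   */
		free(c->bo);                 /* stand-in for ws->buffer_destroy()   */
		c->bo = bo;
		c->size = needed;
		return 0;
	}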
@@ -958,38 +1432,93 @@ VkResult radv_QueueSubmit(
     struct radeon_winsys_ctx *ctx = queue->hw_ctx;
     int ret;
     uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
+    uint32_t scratch_size = 0;
+    uint32_t compute_scratch_size = 0;
+    uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
+    struct radeon_winsys_cs *preamble_cs = NULL;
+    VkResult result;
+    bool fence_emitted = false;
+
+    /* Do this first so failing to allocate scratch buffers can't result in
+     * partially executed submissions.
+     */
+    for (uint32_t i = 0; i < submitCount; i++) {
+        for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
+            RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
+                             pSubmits[i].pCommandBuffers[j]);
+
+            scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
+            compute_scratch_size = MAX2(compute_scratch_size,
+                                        cmd_buffer->compute_scratch_size_needed);
+            esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
+            gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
+        }
+    }
+
+    result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
+    if (result != VK_SUCCESS)
+        return result;
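Note: the submission loop below splits each VkSubmitInfo into chunks of at most max_cs_submission command streams (forced to 1 in trace mode so a hang can be pinned to a single CS); wait semaphores attach only to the first chunk and signal semaphores plus the fence only to the last. The shape of that logic as a standalone toy program (illustrative only):

	#include <stdio.h>

	#define MIN2(a, b) ((a) < (b) ? (a) : (b))

	int main(void)
	{
		int n = 7, max_cs_submission = 4;

		for (int j = 0; j < n; ) {
			int advance = MIN2(max_cs_submission, n - j);
			printf("submit cs[%d..%d) waits=%d signals=%d\n",
			       j, j + advance, j == 0, j + advance == n);
			j += advance;
		}
		return 0;
	}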
 
     for (uint32_t i = 0; i < submitCount; i++) {
         struct radeon_winsys_cs **cs_array;
         bool can_patch = true;
         uint32_t advance;
+        int draw_cmd_buffers_count = 0;
 
-        if (!pSubmits[i].commandBufferCount)
+        for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
+            RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
+                             pSubmits[i].pCommandBuffers[j]);
+            assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+            if (cmd_buffer->no_draws == true)
+                continue;
+            draw_cmd_buffers_count++;
+        }
+
+        if (!draw_cmd_buffers_count) {
+            if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) {
+                ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
+                                                   &queue->device->empty_cs[queue->queue_family_index],
+                                                   1, NULL,
+                                                   (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
+                                                   pSubmits[i].waitSemaphoreCount,
+                                                   (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
+                                                   pSubmits[i].signalSemaphoreCount,
+                                                   false, base_fence);
+                if (ret) {
+                    radv_loge("failed to submit CS %d\n", i);
+                    abort();
+                }
+                fence_emitted = true;
+            }
             continue;
+        }
 
-        cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
-                          pSubmits[i].commandBufferCount);
+        cs_array = malloc(sizeof(struct radeon_winsys_cs *) * draw_cmd_buffers_count);
+        int draw_cmd_buffer_idx = 0;
 
         for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
             RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
                              pSubmits[i].pCommandBuffers[j]);
             assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+            if (cmd_buffer->no_draws == true)
+                continue;
 
-            cs_array[j] = cmd_buffer->cs;
+            cs_array[draw_cmd_buffer_idx] = cmd_buffer->cs;
+            draw_cmd_buffer_idx++;
             if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
                 can_patch = false;
         }
 
-        for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
+        for (uint32_t j = 0; j < draw_cmd_buffers_count; j += advance) {
             advance = MIN2(max_cs_submission,
-                           pSubmits[i].commandBufferCount - j);
+                           draw_cmd_buffers_count - j);
             bool b = j == 0;
-            bool e = j + advance == pSubmits[i].commandBufferCount;
+            bool e = j + advance == draw_cmd_buffers_count;
 
             if (queue->device->trace_bo)
                 *queue->device->trace_id_ptr = 0;
 
-            ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, advance,
+            ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
+                                               advance, preamble_cs,
                                                (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
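Note: fence_emitted, together with the empty_cs fallbacks, covers two Vulkan requirements: a submit whose command buffers were all skipped as draw-free must still wait on and signal its semaphores, and a fence-only vkQueueSubmit must still signal its fence. Caller-visible behaviour this preserves (illustrative fragment, not from this patch):

	vkQueueSubmit(queue, 0, NULL, fence);                    /* no work at all        */
	vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX); /* must still return     */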
                                                b ? pSubmits[i].waitSemaphoreCount : 0,
                                                (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
@@ -1000,6 +1529,7 @@ VkResult radv_QueueSubmit(
                 radv_loge("failed to submit CS %d\n", i);
                 abort();
             }
+            fence_emitted = true;
             if (queue->device->trace_bo) {
                 bool success = queue->device->ws->ctx_wait_idle(
                     queue->hw_ctx,
@@ -1017,10 +1547,11 @@ VkResult radv_QueueSubmit(
     }
 
     if (fence) {
-        if (!submitCount)
+        if (!fence_emitted)
             ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
                                                &queue->device->empty_cs[queue->queue_family_index],
-                                               1, NULL, 0, NULL, 0, false, base_fence);
+                                               1, NULL, NULL, 0, NULL, 0,
+                                               false, base_fence);
 
         fence->submitted = true;
 
@@ -1121,7 +1652,7 @@ VkResult radv_AllocateMemory(
     if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
         flags |= RADEON_FLAG_GTT_WC;
 
-    mem->bo = device->ws->buffer_create(device->ws, alloc_size, 32768,
+    mem->bo = device->ws->buffer_create(device->ws, alloc_size, 65536,
                                         domain, flags);
 
     if (!mem->bo) {
@@ -1608,8 +2139,9 @@ radv_initialise_color_surface(struct radv_device *device,
         va += iview->image->dcc_offset;
     cb->cb_dcc_base = va >> 8;
 
+    uint32_t max_slice = iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count;
     cb->cb_color_view = S_028C6C_SLICE_START(iview->base_layer) |
-        S_028C6C_SLICE_MAX(iview->base_layer + iview->extent.depth - 1);
+        S_028C6C_SLICE_MAX(iview->base_layer + max_slice - 1);
 
     cb->micro_tile_mode = iview->image->surface.micro_tile_mode;
     pitch_tile_max = level_info->nblk_x / 8 - 1;
@@ -1693,7 +2225,7 @@ radv_initialise_color_surface(struct radv_device *device,
         cb->cb_color_info |= S_028C70_COMPRESSION(1);
 
     if (iview->image->cmask.size &&
-        (device->debug_flags & RADV_DEBUG_FAST_CLEARS))
+        !(device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS))
         cb->cb_color_info |= S_028C70_FAST_CLEAR(1);
 
     if (iview->image->surface.dcc_size && level_info->dcc_enabled)
@@ -1761,8 +2293,9 @@ radv_initialise_ds_surface(struct radv_device *device,
     z_offs += iview->image->surface.level[level].offset;
     s_offs += iview->image->surface.stencil_level[level].offset;
 
+    uint32_t max_slice = iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count;
     ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) |
-        S_028008_SLICE_MAX(iview->base_layer + iview->extent.depth - 1);
+        S_028008_SLICE_MAX(iview->base_layer + max_slice - 1);
     ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
     ds->db_z_info = S_028040_FORMAT(format) |
         S_028040_ZRANGE_PRECISION(1);
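Note: the two max_slice hunks above fix layered rendering: for non-3D views iview->extent.depth is 1, so the old SLICE_MAX computation clipped everything past the first array layer; 3D views keep using the depth. The computation in isolation, with illustrative values:

	#include <assert.h>
	#include <stdbool.h>

	static unsigned slice_max(bool is_3d, unsigned base_layer,
	                          unsigned depth, unsigned layer_count)
	{
		unsigned max_slice = is_3d ? depth : layer_count;
		return base_layer + max_slice - 1;
	}

	int main(void)
	{
		assert(slice_max(false, 2, 1, 4) == 5); /* 2D array view: layers 2..5 */
		assert(slice_max(true, 0, 8, 1) == 7);  /* 3D view: slices 0..7       */
		return 0;
	}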