X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_device.c;h=a7997cf9c209c3ba8dc3dfb6245eedf6c38aa83f;hb=b99295fb332bcde5c4168acb5d9d9aede10519e3;hp=05d09bb08eb68102ea87169ec49945198db88a3f;hpb=b9fb90e6d35b19a68a163ab28fba87ab2a950e89;p=mesa.git diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 05d09bb08eb..a7997cf9c20 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -25,16 +25,29 @@ * IN THE SOFTWARE. */ +#include "dirent.h" +#include +#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include #include #include +#include + #include "radv_debug.h" #include "radv_private.h" #include "radv_shader.h" #include "radv_cs.h" #include "util/disk_cache.h" -#include "util/strtod.h" #include "vk_util.h" #include #include @@ -47,9 +60,29 @@ #include "util/build_id.h" #include "util/debug.h" #include "util/mesa-sha1.h" +#include "util/timespec.h" +#include "util/u_atomic.h" #include "compiler/glsl_types.h" #include "util/xmlpool.h" +static struct radv_timeline_point * +radv_timeline_find_point_at_least_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p); + +static struct radv_timeline_point * +radv_timeline_add_point_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p); + +static void +radv_timeline_trigger_waiters_locked(struct radv_timeline *timeline, + struct list_head *processing_list); + +static +void radv_destroy_semaphore_part(struct radv_device *device, + struct radv_semaphore_part *part); + static int radv_device_get_cache_uuid(enum radeon_family family, void *uuid) { @@ -84,44 +117,6 @@ radv_get_device_uuid(struct radeon_info *info, void *uuid) ac_compute_device_uuid(info, uuid, VK_UUID_SIZE); } -static void -radv_get_device_name(enum radeon_family family, char *name, size_t name_len) -{ - const char *chip_string; - - switch (family) { - case CHIP_TAHITI: chip_string = "AMD RADV TAHITI"; break; - case CHIP_PITCAIRN: chip_string = "AMD RADV PITCAIRN"; break; - case CHIP_VERDE: chip_string = "AMD RADV CAPE VERDE"; break; - case CHIP_OLAND: chip_string = "AMD RADV OLAND"; break; - case CHIP_HAINAN: chip_string = "AMD RADV HAINAN"; break; - case CHIP_BONAIRE: chip_string = "AMD RADV BONAIRE"; break; - case CHIP_KAVERI: chip_string = "AMD RADV KAVERI"; break; - case CHIP_KABINI: chip_string = "AMD RADV KABINI"; break; - case CHIP_HAWAII: chip_string = "AMD RADV HAWAII"; break; - case CHIP_TONGA: chip_string = "AMD RADV TONGA"; break; - case CHIP_ICELAND: chip_string = "AMD RADV ICELAND"; break; - case CHIP_CARRIZO: chip_string = "AMD RADV CARRIZO"; break; - case CHIP_FIJI: chip_string = "AMD RADV FIJI"; break; - case CHIP_POLARIS10: chip_string = "AMD RADV POLARIS10"; break; - case CHIP_POLARIS11: chip_string = "AMD RADV POLARIS11"; break; - case CHIP_POLARIS12: chip_string = "AMD RADV POLARIS12"; break; - case CHIP_STONEY: chip_string = "AMD RADV STONEY"; break; - case CHIP_VEGAM: chip_string = "AMD RADV VEGA M"; break; - case CHIP_VEGA10: chip_string = "AMD RADV VEGA10"; break; - case CHIP_VEGA12: chip_string = "AMD RADV VEGA12"; break; - case CHIP_VEGA20: chip_string = "AMD RADV VEGA20"; break; - case CHIP_RAVEN: chip_string = "AMD RADV RAVEN"; break; - case CHIP_RAVEN2: chip_string = "AMD RADV RAVEN2"; break; - case CHIP_NAVI10: chip_string = "AMD RADV NAVI10"; break; - case CHIP_NAVI12: chip_string = "AMD RADV NAVI12"; break; - case CHIP_NAVI14: chip_string = "AMD RADV NAVI14"; break; 
- default: chip_string = "AMD RADV unknown"; break; - } - - snprintf(name, name_len, "%s (LLVM " MESA_LLVM_VERSION_STRING ")", chip_string); -} - static uint64_t radv_get_visible_vram_size(struct radv_physical_device *device) { @@ -134,6 +129,42 @@ radv_get_vram_size(struct radv_physical_device *device) return device->rad_info.vram_size - radv_get_visible_vram_size(device); } +static bool +radv_is_mem_type_vram(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_VRAM || + type == RADV_MEM_TYPE_VRAM_UNCACHED; +} + +static bool +radv_is_mem_type_vram_visible(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_VRAM_CPU_ACCESS || + type == RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED; +} +static bool +radv_is_mem_type_gtt_wc(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_GTT_WRITE_COMBINE || + type == RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED; +} + +static bool +radv_is_mem_type_gtt_cached(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_GTT_CACHED || + type == RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED; +} + +static bool +radv_is_mem_type_uncached(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_VRAM_UNCACHED || + type == RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED || + type == RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED || + type == RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED; +} + static void radv_physical_device_init_mem_types(struct radv_physical_device *device) { @@ -173,12 +204,11 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device) .heapIndex = vram_index, }; } - if (gart_index >= 0) { + if (gart_index >= 0 && device->rad_info.has_dedicated_vram) { device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_WRITE_COMBINE; device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) { .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - (device->rad_info.has_dedicated_vram ? 0 : VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT), + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, .heapIndex = gart_index, }; } @@ -191,6 +221,19 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device) .heapIndex = visible_vram_index, }; } + if (gart_index >= 0 && !device->rad_info.has_dedicated_vram) { + /* Put GTT after visible VRAM for GPUs without dedicated VRAM + * as they have identical property flags, and according to the + * spec, for types with identical flags, the one with greater + * performance must be given a lower index. 
*/ + device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_WRITE_COMBINE; + device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + .heapIndex = gart_index, + }; + } if (gart_index >= 0) { device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_CACHED; device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) { @@ -202,6 +245,46 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device) }; } device->memory_properties.memoryTypeCount = type_count; + + if (device->rad_info.has_l2_uncached) { + for (int i = 0; i < device->memory_properties.memoryTypeCount; i++) { + VkMemoryType mem_type = device->memory_properties.memoryTypes[i]; + + if ((mem_type.propertyFlags & (VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) || + mem_type.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) { + enum radv_mem_type mem_type_id; + + switch (device->mem_type_indices[i]) { + case RADV_MEM_TYPE_VRAM: + mem_type_id = RADV_MEM_TYPE_VRAM_UNCACHED; + break; + case RADV_MEM_TYPE_VRAM_CPU_ACCESS: + mem_type_id = RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED; + break; + case RADV_MEM_TYPE_GTT_WRITE_COMBINE: + mem_type_id = RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED; + break; + case RADV_MEM_TYPE_GTT_CACHED: + mem_type_id = RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED; + break; + default: + unreachable("invalid memory type"); + } + + VkMemoryPropertyFlags property_flags = mem_type.propertyFlags | + VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD | + VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD; + + device->mem_type_indices[type_count] = mem_type_id; + device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) { + .propertyFlags = property_flags, + .heapIndex = mem_type.heapIndex, + }; + } + } + device->memory_properties.memoryTypeCount = type_count; + } } static void @@ -314,7 +397,15 @@ radv_physical_device_init(struct radv_physical_device *device, radv_handle_env_var_force_family(device); - radv_get_device_name(device->rad_info.family, device->name, sizeof(device->name)); + device->use_aco = instance->perftest_flags & RADV_PERFTEST_ACO; + if (device->rad_info.chip_class < GFX8 && device->use_aco) { + fprintf(stderr, "WARNING: disabling ACO on unsupported GPUs.\n"); + device->use_aco = false; + } + + snprintf(device->name, sizeof(device->name), + "AMD RADV%s %s (LLVM " MESA_LLVM_VERSION_STRING ")", device->use_aco ? "/ACO" : "", + device->rad_info.name); if (radv_device_get_cache_uuid(device->rad_info.family, device->cache_uuid)) { device->ws->destroy(device->ws); @@ -326,7 +417,7 @@ radv_physical_device_init(struct radv_physical_device *device, /* These flags affect shader compilation. */ uint64_t shader_env_flags = (device->instance->perftest_flags & RADV_PERFTEST_SISCHED ? 0x1 : 0) | - (device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH ? 0x2 : 0); + (device->use_aco ? 0x2 : 0); /* The gpu id is already embedded in the uuid so we just pass "radv" * when creating the cache. 
@@ -342,46 +433,23 @@ radv_physical_device_init(struct radv_physical_device *device, radv_get_driver_uuid(&device->driver_uuid); radv_get_device_uuid(&device->rad_info, &device->device_uuid); - if (device->rad_info.family == CHIP_STONEY || - device->rad_info.chip_class >= GFX9) { - device->has_rbplus = true; - device->rbplus_allowed = device->rad_info.family == CHIP_STONEY || - device->rad_info.family == CHIP_VEGA12 || - device->rad_info.family == CHIP_RAVEN || - device->rad_info.family == CHIP_RAVEN2; - } - - /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs - * on GFX6. - */ - device->has_clear_state = device->rad_info.chip_class >= GFX7; - - device->cpdma_prefetch_writes_memory = device->rad_info.chip_class <= GFX8; - - /* Vega10/Raven need a special workaround for a hardware bug. */ - device->has_scissor_bug = device->rad_info.family == CHIP_VEGA10 || - device->rad_info.family == CHIP_RAVEN; - - device->has_tc_compat_zrange_bug = device->rad_info.chip_class < GFX10; - - /* Out-of-order primitive rasterization. */ - device->has_out_of_order_rast = device->rad_info.chip_class >= GFX8 && - device->rad_info.max_se >= 2; - device->out_of_order_rast_allowed = device->has_out_of_order_rast && + device->out_of_order_rast_allowed = device->rad_info.has_out_of_order_rast && !(device->instance->debug_flags & RADV_DEBUG_NO_OUT_OF_ORDER); device->dcc_msaa_allowed = (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA); - /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ - device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 || - (device->rad_info.chip_class >= GFX8 && - device->rad_info.me_fw_feature >= 41); + device->use_shader_ballot = device->use_aco || (device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT); - device->has_dcc_constant_encode = device->rad_info.family == CHIP_RAVEN2 || - device->rad_info.chip_class >= GFX10; + device->use_ngg = device->rad_info.chip_class >= GFX10 && + device->rad_info.family != CHIP_NAVI14 && + !(device->instance->debug_flags & RADV_DEBUG_NO_NGG); + if (device->use_aco && device->use_ngg) { + fprintf(stderr, "WARNING: disabling NGG because ACO is used.\n"); + device->use_ngg = false; + } - device->use_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT; + device->use_ngg_streamout = false; /* Determine the number of threads per wave for all stages. 
*/ device->cs_wave_size = 64; @@ -474,7 +542,6 @@ static const struct debug_control radv_debug_options[] = { {"shaderstats", RADV_DEBUG_DUMP_SHADER_STATS}, {"nohiz", RADV_DEBUG_NO_HIZ}, {"nocompute", RADV_DEBUG_NO_COMPUTE_QUEUE}, - {"unsafemath", RADV_DEBUG_UNSAFE_MATH}, {"allbos", RADV_DEBUG_ALL_BOS}, {"noibs", RADV_DEBUG_NO_IBS}, {"spirv", RADV_DEBUG_DUMP_SPIRV}, @@ -493,6 +560,10 @@ static const struct debug_control radv_debug_options[] = { {"nobinning", RADV_DEBUG_NOBINNING}, {"noloadstoreopt", RADV_DEBUG_NO_LOAD_STORE_OPT}, {"nongg", RADV_DEBUG_NO_NGG}, + {"noshaderballot", RADV_DEBUG_NO_SHADER_BALLOT}, + {"allentrypoints", RADV_DEBUG_ALL_ENTRYPOINTS}, + {"metashaders", RADV_DEBUG_DUMP_META_SHADERS}, + {"nomemorycache", RADV_DEBUG_NO_MEMORY_CACHE}, {NULL, 0} }; @@ -514,6 +585,8 @@ static const struct debug_control radv_perftest_options[] = { {"cswave32", RADV_PERFTEST_CS_WAVE_32}, {"pswave32", RADV_PERFTEST_PS_WAVE_32}, {"gewave32", RADV_PERFTEST_GE_WAVE_32}, + {"dfsm", RADV_PERFTEST_DFSM}, + {"aco", RADV_PERFTEST_ACO}, {NULL, 0} }; @@ -549,8 +622,24 @@ radv_handle_per_app_options(struct radv_instance *instance, * load/store memory operations. * See https://reviews.llvm.org/D61313 */ - if (HAVE_LLVM < 0x900) + if (LLVM_VERSION_MAJOR < 9) instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT; + } else if (!strcmp(name, "Wolfenstein: Youngblood")) { + if (!(instance->debug_flags & RADV_DEBUG_NO_SHADER_BALLOT)) { + /* Force enable VK_AMD_shader_ballot because it looks + * safe and it gives a nice boost (+20% on Vega 56 at + * this time). + */ + instance->perftest_flags |= RADV_PERFTEST_SHADER_BALLOT; + } + } else if (!strcmp(name, "Fledge")) { + /* + * Zero VRAM for "The Surge 2" + * + * This avoid a hang when when rendering any level. Likely + * uninitialized data in an indirect draw. + */ + instance->debug_flags |= RADV_DEBUG_ZERO_VRAM; } } @@ -565,8 +654,10 @@ static int radv_get_instance_extension_index(const char *name) static const char radv_dri_options_xml[] = DRI_CONF_BEGIN - DRI_CONF_SECTION_QUALITY + DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC("true") + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT("false") DRI_CONF_SECTION_END DRI_CONF_END; @@ -575,7 +666,9 @@ static void radv_init_dri_options(struct radv_instance *instance) driParseOptionInfo(&instance->available_dri_options, radv_dri_options_xml); driParseConfigFiles(&instance->dri_options, &instance->available_dri_options, - 0, "radv", NULL); + 0, "radv", NULL, + instance->engineName, + instance->engineVersion); } VkResult radv_CreateInstance( @@ -596,6 +689,13 @@ VkResult radv_CreateInstance( client_version = VK_API_VERSION_1_0; } + const char *engine_name = NULL; + uint32_t engine_version = 0; + if (pCreateInfo->pApplicationInfo) { + engine_name = pCreateInfo->pApplicationInfo->pEngineName; + engine_version = pCreateInfo->pApplicationInfo->engineVersion; + } + instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); if (!instance) @@ -611,12 +711,24 @@ VkResult radv_CreateInstance( instance->apiVersion = client_version; instance->physicalDeviceCount = -1; + /* Get secure compile thread count. 
NOTE: We cap this at 32 */ +#define MAX_SC_PROCS 32 + char *num_sc_threads = getenv("RADV_SECURE_COMPILE_THREADS"); + if (num_sc_threads) + instance->num_sc_threads = MIN2(strtoul(num_sc_threads, NULL, 10), MAX_SC_PROCS); + instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"), radv_debug_options); + /* Disable memory cache when secure compile is set */ + if (radv_device_use_secure_compile(instance)) + instance->debug_flags |= RADV_DEBUG_NO_MEMORY_CACHE; + instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"), radv_perftest_options); + if (instance->perftest_flags & RADV_PERFTEST_ACO) + fprintf(stderr, "WARNING: Experimental compiler backend enabled. Here be dragons! Incorrect rendering, GPU hangs and/or resets are likely\n"); if (instance->debug_flags & RADV_DEBUG_STARTUP) radv_logi("Created an instance"); @@ -639,7 +751,10 @@ VkResult radv_CreateInstance( return vk_error(instance, result); } - _mesa_locale_init(); + instance->engineName = vk_strdup(&instance->alloc, engine_name, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + instance->engineVersion = engine_version; + glsl_type_singleton_init_or_ref(); VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); @@ -665,10 +780,11 @@ void radv_DestroyInstance( radv_physical_device_finish(instance->physicalDevices + i); } + vk_free(&instance->alloc, instance->engineName); + VG(VALGRIND_DESTROY_MEMPOOL(instance)); glsl_type_singleton_decref(); - _mesa_locale_fini(); driDestroyOptionCache(&instance->dri_options); driDestroyOptionInfo(&instance->available_dri_options); @@ -821,7 +937,7 @@ void radv_GetPhysicalDeviceFeatures( .shaderCullDistance = true, .shaderFloat64 = true, .shaderInt64 = true, - .shaderInt16 = pdevice->rad_info.chip_class >= GFX9, + .shaderInt16 = pdevice->rad_info.chip_class >= GFX9 && !pdevice->use_aco, .sparseBinding = true, .variableMultisampleRate = true, .inheritedQueries = true, @@ -863,11 +979,11 @@ void radv_GetPhysicalDeviceFeatures2( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { VkPhysicalDevice16BitStorageFeatures *features = (VkPhysicalDevice16BitStorageFeatures*)ext; - bool enabled = pdevice->rad_info.chip_class >= GFX8; + bool enabled = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco; features->storageBuffer16BitAccess = enabled; features->uniformAndStorageBuffer16BitAccess = enabled; features->storagePushConstant16 = enabled; - features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900; + features->storageInputOutput16 = enabled && LLVM_VERSION_MAJOR >= 9; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { @@ -919,7 +1035,7 @@ void radv_GetPhysicalDeviceFeatures2( VkPhysicalDeviceTransformFeedbackFeaturesEXT *features = (VkPhysicalDeviceTransformFeedbackFeaturesEXT*)ext; features->transformFeedback = true; - features->geometryStreams = true; + features->geometryStreams = !pdevice->use_ngg_streamout; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT: { @@ -957,24 +1073,30 @@ void radv_GetPhysicalDeviceFeatures2( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: { VkPhysicalDevice8BitStorageFeaturesKHR *features = (VkPhysicalDevice8BitStorageFeaturesKHR*)ext; - bool enabled = pdevice->rad_info.chip_class >= GFX8; + bool enabled = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco; features->storageBuffer8BitAccess = enabled; features->uniformAndStorageBuffer8BitAccess = enabled; features->storagePushConstant8 = enabled; break; } - case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { - VkPhysicalDeviceFloat16Int8FeaturesKHR *features = - (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext; - features->shaderFloat16 = pdevice->rad_info.chip_class >= GFX8 && HAVE_LLVM >= 0x0800; - features->shaderInt8 = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR: { + VkPhysicalDeviceShaderFloat16Int8FeaturesKHR *features = + (VkPhysicalDeviceShaderFloat16Int8FeaturesKHR*)ext; + features->shaderFloat16 = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco; + features->shaderInt8 = !pdevice->use_aco; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR: { VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *features = (VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *)ext; - features->shaderBufferInt64Atomics = HAVE_LLVM >= 0x0900; - features->shaderSharedInt64Atomics = HAVE_LLVM >= 0x0900; + features->shaderBufferInt64Atomics = LLVM_VERSION_MAJOR >= 9; + features->shaderSharedInt64Atomics = LLVM_VERSION_MAJOR >= 9; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT: { + VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *features = + (VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *)ext; + features->shaderDemoteToHelperInvocation = pdevice->use_aco; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: { @@ -1022,6 +1144,44 @@ void radv_GetPhysicalDeviceFeatures2( features->pipelineExecutableInfo = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: { + VkPhysicalDeviceShaderClockFeaturesKHR *features = + (VkPhysicalDeviceShaderClockFeaturesKHR *)ext; + features->shaderSubgroupClock = true; + features->shaderDeviceClock = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: { + VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features = + (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext; + features->texelBufferAlignment = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR: { + VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *features = + (VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *) ext; + features->timelineSemaphore = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: { + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *features = + (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *)ext; + features->subgroupSizeControl = true; + features->computeFullSubgroups = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD: { + VkPhysicalDeviceCoherentMemoryFeaturesAMD *features = + (VkPhysicalDeviceCoherentMemoryFeaturesAMD *)ext; + features->deviceCoherentMemory = pdevice->rad_info.has_l2_uncached; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES_KHR: { + VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *features = + (VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *)ext; + features->shaderSubgroupExtendedTypes = true; + break; + } default: break; } @@ -1121,7 +1281,7 @@ void radv_GetPhysicalDeviceProperties( .viewportBoundsRange = { INT16_MIN, INT16_MAX }, .viewportSubPixelBits = 8, .minMemoryMapAlignment = 4096, /* A page */ - .minTexelBufferOffsetAlignment = 1, + .minTexelBufferOffsetAlignment = 4, .minUniformBufferOffsetAlignment = 4, .minStorageBufferOffsetAlignment = 4, .minTexelOffset = -32, @@ -1229,12 +1389,14 @@ void 
radv_GetPhysicalDeviceProperties2( properties->supportedStages = VK_SHADER_STAGE_ALL; properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT | - VK_SUBGROUP_FEATURE_QUAD_BIT | - VK_SUBGROUP_FEATURE_VOTE_BIT; - if (pdevice->rad_info.chip_class >= GFX8) { + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + if (pdevice->rad_info.chip_class == GFX8 || + pdevice->rad_info.chip_class == GFX9) { properties->supportedOperations |= - VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT; } @@ -1282,7 +1444,7 @@ void radv_GetPhysicalDeviceProperties2( /* SGPR. */ properties->sgprsPerSimd = - ac_get_num_physical_sgprs(pdevice->rad_info.chip_class); + pdevice->rad_info.num_physical_sgprs_per_simd; properties->minSgprAllocation = pdevice->rad_info.chip_class >= GFX8 ? 16 : 8; properties->maxSgprAllocation = @@ -1298,6 +1460,15 @@ void radv_GetPhysicalDeviceProperties2( properties->vgprAllocationGranularity = 4; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_2_AMD: { + VkPhysicalDeviceShaderCoreProperties2AMD *properties = + (VkPhysicalDeviceShaderCoreProperties2AMD *)ext; + + properties->shaderCoreFeatures = 0; + properties->activeComputeUnitCount = + pdevice->rad_info.num_good_compute_units; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *properties = (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; @@ -1396,8 +1567,8 @@ void radv_GetPhysicalDeviceProperties2( properties->maxTransformFeedbackStreamDataSize = 512; properties->maxTransformFeedbackBufferDataSize = UINT32_MAX; properties->maxTransformFeedbackBufferDataStride = 512; - properties->transformFeedbackQueries = true; - properties->transformFeedbackStreamsLinesTriangles = true; + properties->transformFeedbackQueries = !pdevice->use_ngg_streamout; + properties->transformFeedbackStreamsLinesTriangles = !pdevice->use_ngg_streamout; properties->transformFeedbackRasterizationStreamSelect = false; properties->transformFeedbackDraw = true; break; @@ -1447,6 +1618,76 @@ void radv_GetPhysicalDeviceProperties2( properties->independentResolve = VK_TRUE; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: { + VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *properties = + (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *)ext; + properties->storageTexelBufferOffsetAlignmentBytes = 4; + properties->storageTexelBufferOffsetSingleTexelAlignment = true; + properties->uniformTexelBufferOffsetAlignmentBytes = 4; + properties->uniformTexelBufferOffsetSingleTexelAlignment = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR : { + VkPhysicalDeviceFloatControlsPropertiesKHR *properties = + (VkPhysicalDeviceFloatControlsPropertiesKHR *)ext; + + /* On AMD hardware, denormals and rounding modes for + * fp16/fp64 are controlled by the same config + * register. + */ + properties->denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR; + properties->roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR; + + /* Do not allow both preserving and flushing denorms + * because different shaders in the same pipeline can + * have different settings and this won't work for + * merged shaders. 
To make it work, this requires LLVM + * support for changing the register. The same logic + * applies for the rounding modes because they are + * configured with the same config register. + * TODO: we can enable a lot of these for ACO when it + * supports all stages + */ + properties->shaderDenormFlushToZeroFloat32 = true; + properties->shaderDenormPreserveFloat32 = false; + properties->shaderRoundingModeRTEFloat32 = true; + properties->shaderRoundingModeRTZFloat32 = false; + properties->shaderSignedZeroInfNanPreserveFloat32 = true; + + properties->shaderDenormFlushToZeroFloat16 = false; + properties->shaderDenormPreserveFloat16 = pdevice->rad_info.chip_class >= GFX8; + properties->shaderRoundingModeRTEFloat16 = pdevice->rad_info.chip_class >= GFX8; + properties->shaderRoundingModeRTZFloat16 = false; + properties->shaderSignedZeroInfNanPreserveFloat16 = pdevice->rad_info.chip_class >= GFX8; + + properties->shaderDenormFlushToZeroFloat64 = false; + properties->shaderDenormPreserveFloat64 = pdevice->rad_info.chip_class >= GFX8; + properties->shaderRoundingModeRTEFloat64 = pdevice->rad_info.chip_class >= GFX8; + properties->shaderRoundingModeRTZFloat64 = false; + properties->shaderSignedZeroInfNanPreserveFloat64 = pdevice->rad_info.chip_class >= GFX8; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR: { + VkPhysicalDeviceTimelineSemaphorePropertiesKHR *props = + (VkPhysicalDeviceTimelineSemaphorePropertiesKHR *) ext; + props->maxTimelineSemaphoreValueDifference = UINT64_MAX; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: { + VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *props = + (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *)ext; + props->minSubgroupSize = 64; + props->maxSubgroupSize = 64; + props->maxComputeWorkgroupSubgroups = UINT32_MAX; + props->requiredSubgroupSizeStages = 0; + + if (pdevice->rad_info.chip_class >= GFX10) { + /* Only GFX10+ supports wave32. 
*/ + props->minSubgroupSize = 32; + props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT; + } + break; + } default: break; } @@ -1460,7 +1701,7 @@ static void radv_get_physical_device_queue_family_properties( { int num_queue_families = 1; int idx; - if (pdevice->rad_info.num_compute_rings > 0 && + if (pdevice->rad_info.num_rings[RING_COMPUTE] > 0 && !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) num_queue_families++; @@ -1486,14 +1727,14 @@ static void radv_get_physical_device_queue_family_properties( idx++; } - if (pdevice->rad_info.num_compute_rings > 0 && + if (pdevice->rad_info.num_rings[RING_COMPUTE] > 0 && !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) { if (*pCount > idx) { *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT, - .queueCount = pdevice->rad_info.num_compute_rings, + .queueCount = pdevice->rad_info.num_rings[RING_COMPUTE], .timestampValidBits = 64, .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, }; @@ -1573,8 +1814,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice, for (int i = 0; i < device->memory_properties.memoryTypeCount; i++) { uint32_t heap_index = device->memory_properties.memoryTypes[i].heapIndex; - switch (device->mem_type_indices[i]) { - case RADV_MEM_TYPE_VRAM: + if (radv_is_mem_type_vram(device->mem_type_indices[i])) { heap_usage = device->ws->query_value(device->ws, RADEON_ALLOCATED_VRAM); @@ -1584,8 +1824,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice, memoryBudget->heapBudget[heap_index] = heap_budget; memoryBudget->heapUsage[heap_index] = heap_usage; - break; - case RADV_MEM_TYPE_VRAM_CPU_ACCESS: + } else if (radv_is_mem_type_vram_visible(device->mem_type_indices[i])) { heap_usage = device->ws->query_value(device->ws, RADEON_ALLOCATED_VRAM_VIS); @@ -1595,8 +1834,7 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice, memoryBudget->heapBudget[heap_index] = heap_budget; memoryBudget->heapUsage[heap_index] = heap_usage; - break; - case RADV_MEM_TYPE_GTT_WRITE_COMBINE: + } else if (radv_is_mem_type_gtt_wc(device->mem_type_indices[i])) { heap_usage = device->ws->query_value(device->ws, RADEON_ALLOCATED_GTT); @@ -1606,9 +1844,6 @@ radv_get_memory_budget_properties(VkPhysicalDevice physicalDevice, memoryBudget->heapBudget[heap_index] = heap_budget; memoryBudget->heapUsage[heap_index] = heap_usage; - break; - default: - break; } } @@ -1650,7 +1885,7 @@ VkResult radv_GetMemoryHostPointerPropertiesEXT( const struct radv_physical_device *physical_device = device->physical_device; uint32_t memoryTypeBits = 0; for (int i = 0; i < physical_device->memory_properties.memoryTypeCount; i++) { - if (physical_device->mem_type_indices[i] == RADV_MEM_TYPE_GTT_CACHED) { + if (radv_is_mem_type_gtt_cached(physical_device->mem_type_indices[i])) { memoryTypeBits = (1 << i); break; } @@ -1702,12 +1937,17 @@ radv_queue_init(struct radv_device *device, struct radv_queue *queue, if (!queue->hw_ctx) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + list_inithead(&queue->pending_submissions); + pthread_mutex_init(&queue->pending_mutex, NULL); + return VK_SUCCESS; } static void radv_queue_finish(struct radv_queue *queue) { + pthread_mutex_destroy(&queue->pending_mutex); + if (queue->hw_ctx) queue->device->ws->ctx_destroy(queue->hw_ctx); @@ -1727,6 +1967,10 @@ radv_queue_finish(struct radv_queue *queue) queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); if 
(queue->tess_rings_bo) queue->device->ws->buffer_destroy(queue->tess_rings_bo); + if (queue->gds_bo) + queue->device->ws->buffer_destroy(queue->gds_bo); + if (queue->gds_oa_bo) + queue->device->ws->buffer_destroy(queue->gds_oa_bo); if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); } @@ -1836,6 +2080,553 @@ radv_get_int_debug_option(const char *name, int default_value) return result; } +static int install_seccomp_filter() { + + struct sock_filter filter[] = { + /* Check arch is 64bit x86 */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, arch))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 0, 12), + + /* Futex is required for mutex locks */ + #if defined __NR__newselect + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR__newselect, 11, 0), + #elif defined __NR_select + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_select, 11, 0), + #else + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_pselect6, 11, 0), + #endif + + /* Allow system exit calls for the forked process */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_exit_group, 9, 0), + + /* Allow system read calls */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_read, 7, 0), + + /* Allow system write calls */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_write, 5, 0), + + /* Allow system brk calls (we need this for malloc) */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_brk, 3, 0), + + /* Futex is required for mutex locks */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_futex, 1, 0), + + /* Return error if we hit a system call not on the whitelist */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)), + + /* Allow whitelisted system calls */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])), + .filter = filter, + }; + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) + return -1; + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) + return -1; + + return 0; +} + +/* Helper function with timeout support for reading from the pipe between + * processes used for secure compile. + */ +bool radv_sc_read(int fd, void *buf, size_t size, bool timeout) +{ + fd_set fds; + struct timeval tv; + + FD_ZERO(&fds); + FD_SET(fd, &fds); + + while (true) { + /* We can't rely on the value of tv after calling select() so + * we must reset it on each iteration of the loop. + */ + tv.tv_sec = 5; + tv.tv_usec = 0; + + int rval = select(fd + 1, &fds, NULL, NULL, timeout ? 
&tv : NULL); + + if (rval == -1) { + /* select error */ + return false; + } else if (rval) { + ssize_t bytes_read = read(fd, buf, size); + if (bytes_read < 0) + return false; + + buf += bytes_read; + size -= bytes_read; + if (size == 0) + return true; + } else { + /* select timeout */ + return false; + } + } +} + +static bool radv_close_all_fds(const int *keep_fds, int keep_fd_count) +{ + DIR *d; + struct dirent *dir; + d = opendir("/proc/self/fd"); + if (!d) + return false; + int dir_fd = dirfd(d); + + while ((dir = readdir(d)) != NULL) { + if (dir->d_name[0] == '.') + continue; + + int fd = atoi(dir->d_name); + if (fd == dir_fd) + continue; + + bool keep = false; + for (int i = 0; !keep && i < keep_fd_count; ++i) + if (keep_fds[i] == fd) + keep = true; + + if (keep) + continue; + + close(fd); + } + closedir(d); + return true; +} + +static bool secure_compile_open_fifo_fds(struct radv_secure_compile_state *sc, + int *fd_server, int *fd_client, + unsigned process, bool make_fifo) +{ + bool result = false; + char *fifo_server_path = NULL; + char *fifo_client_path = NULL; + + if (asprintf(&fifo_server_path, "/tmp/radv_server_%s_%u", sc->uid, process) == -1) + goto open_fifo_exit; + + if (asprintf(&fifo_client_path, "/tmp/radv_client_%s_%u", sc->uid, process) == -1) + goto open_fifo_exit; + + if (make_fifo) { + int file1 = mkfifo(fifo_server_path, 0666); + if(file1 < 0) + goto open_fifo_exit; + + int file2 = mkfifo(fifo_client_path, 0666); + if(file2 < 0) + goto open_fifo_exit; + } + + *fd_server = open(fifo_server_path, O_RDWR); + if(*fd_server < 1) + goto open_fifo_exit; + + *fd_client = open(fifo_client_path, O_RDWR); + if(*fd_client < 1) { + close(*fd_server); + goto open_fifo_exit; + } + + result = true; + +open_fifo_exit: + free(fifo_server_path); + free(fifo_client_path); + + return result; +} + +static void run_secure_compile_device(struct radv_device *device, unsigned process, + int fd_idle_device_output) +{ + int fd_secure_input; + int fd_secure_output; + bool fifo_result = secure_compile_open_fifo_fds(device->sc_state, + &fd_secure_input, + &fd_secure_output, + process, false); + + enum radv_secure_compile_type sc_type; + + const int needed_fds[] = { + fd_secure_input, + fd_secure_output, + fd_idle_device_output, + }; + + if (!fifo_result || !radv_close_all_fds(needed_fds, ARRAY_SIZE(needed_fds)) || + install_seccomp_filter() == -1) { + sc_type = RADV_SC_TYPE_INIT_FAILURE; + } else { + sc_type = RADV_SC_TYPE_INIT_SUCCESS; + device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input; + device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output; + } + + write(fd_idle_device_output, &sc_type, sizeof(sc_type)); + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE) + goto secure_compile_exit; + + while (true) { + radv_sc_read(fd_secure_input, &sc_type, sizeof(sc_type), false); + + if (sc_type == RADV_SC_TYPE_COMPILE_PIPELINE) { + struct radv_pipeline *pipeline; + bool sc_read = true; + + pipeline = vk_zalloc2(&device->alloc, NULL, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + pipeline->device = device; + + /* Read pipeline layout */ + struct radv_pipeline_layout layout; + sc_read = radv_sc_read(fd_secure_input, &layout, sizeof(struct radv_pipeline_layout), true); + sc_read &= radv_sc_read(fd_secure_input, &layout.num_sets, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + for (uint32_t set = 0; set < layout.num_sets; set++) { + uint32_t layout_size; + sc_read &= radv_sc_read(fd_secure_input, 
&layout_size, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + layout.set[set].layout = malloc(layout_size); + layout.set[set].layout->layout_size = layout_size; + sc_read &= radv_sc_read(fd_secure_input, layout.set[set].layout, + layout.set[set].layout->layout_size, true); + } + + pipeline->layout = &layout; + + /* Read pipeline key */ + struct radv_pipeline_key key; + sc_read &= radv_sc_read(fd_secure_input, &key, sizeof(struct radv_pipeline_key), true); + + /* Read pipeline create flags */ + VkPipelineCreateFlags flags; + sc_read &= radv_sc_read(fd_secure_input, &flags, sizeof(VkPipelineCreateFlags), true); + + /* Read stage and shader information */ + uint32_t num_stages; + const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, }; + sc_read &= radv_sc_read(fd_secure_input, &num_stages, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + for (uint32_t i = 0; i < num_stages; i++) { + + /* Read stage */ + gl_shader_stage stage; + sc_read &= radv_sc_read(fd_secure_input, &stage, sizeof(gl_shader_stage), true); + + VkPipelineShaderStageCreateInfo *pStage = calloc(1, sizeof(VkPipelineShaderStageCreateInfo)); + + /* Read entry point name */ + size_t name_size; + sc_read &= radv_sc_read(fd_secure_input, &name_size, sizeof(size_t), true); + if (!sc_read) + goto secure_compile_exit; + + char *ep_name = malloc(name_size); + sc_read &= radv_sc_read(fd_secure_input, ep_name, name_size, true); + pStage->pName = ep_name; + + /* Read shader module */ + size_t module_size; + sc_read &= radv_sc_read(fd_secure_input, &module_size, sizeof(size_t), true); + if (!sc_read) + goto secure_compile_exit; + + struct radv_shader_module *module = malloc(module_size); + sc_read &= radv_sc_read(fd_secure_input, module, module_size, true); + pStage->module = radv_shader_module_to_handle(module); + + /* Read specialization info */ + bool has_spec_info; + sc_read &= radv_sc_read(fd_secure_input, &has_spec_info, sizeof(bool), true); + if (!sc_read) + goto secure_compile_exit; + + if (has_spec_info) { + VkSpecializationInfo *specInfo = malloc(sizeof(VkSpecializationInfo)); + pStage->pSpecializationInfo = specInfo; + + sc_read &= radv_sc_read(fd_secure_input, &specInfo->dataSize, sizeof(size_t), true); + if (!sc_read) + goto secure_compile_exit; + + void *si_data = malloc(specInfo->dataSize); + sc_read &= radv_sc_read(fd_secure_input, si_data, specInfo->dataSize, true); + specInfo->pData = si_data; + + sc_read &= radv_sc_read(fd_secure_input, &specInfo->mapEntryCount, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + VkSpecializationMapEntry *mapEntries = malloc(sizeof(VkSpecializationMapEntry) * specInfo->mapEntryCount); + for (uint32_t j = 0; j < specInfo->mapEntryCount; j++) { + sc_read &= radv_sc_read(fd_secure_input, &mapEntries[j], sizeof(VkSpecializationMapEntry), true); + if (!sc_read) + goto secure_compile_exit; + } + + specInfo->pMapEntries = mapEntries; + } + + pStages[stage] = pStage; + } + + /* Compile the shaders */ + VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = { 0 }; + radv_create_shaders(pipeline, device, NULL, &key, pStages, flags, NULL, stage_feedbacks); + + /* free memory allocated above */ + for (uint32_t set = 0; set < layout.num_sets; set++) + free(layout.set[set].layout); + + for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { + if (!pStages[i]) + continue; + + free((void *) pStages[i]->pName); + free(radv_shader_module_from_handle(pStages[i]->module)); + if 
(pStages[i]->pSpecializationInfo) { + free((void *) pStages[i]->pSpecializationInfo->pData); + free((void *) pStages[i]->pSpecializationInfo->pMapEntries); + free((void *) pStages[i]->pSpecializationInfo); + } + free((void *) pStages[i]); + } + + vk_free(&device->alloc, pipeline); + + sc_type = RADV_SC_TYPE_COMPILE_PIPELINE_FINISHED; + write(fd_secure_output, &sc_type, sizeof(sc_type)); + + } else if (sc_type == RADV_SC_TYPE_DESTROY_DEVICE) { + goto secure_compile_exit; + } + } + +secure_compile_exit: + close(fd_secure_input); + close(fd_secure_output); + close(fd_idle_device_output); + _exit(0); +} + +static enum radv_secure_compile_type fork_secure_compile_device(struct radv_device *device, unsigned process) +{ + int fd_secure_input[2]; + int fd_secure_output[2]; + + /* create pipe descriptors (used to communicate between processes) */ + if (pipe(fd_secure_input) == -1 || pipe(fd_secure_output) == -1) + return RADV_SC_TYPE_INIT_FAILURE; + + + int sc_pid; + if ((sc_pid = fork()) == 0) { + device->sc_state->secure_compile_thread_counter = process; + run_secure_compile_device(device, process, fd_secure_output[1]); + } else { + if (sc_pid == -1) + return RADV_SC_TYPE_INIT_FAILURE; + + /* Read the init result returned from the secure process */ + enum radv_secure_compile_type sc_type; + bool sc_read = radv_sc_read(fd_secure_output[0], &sc_type, sizeof(sc_type), true); + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE || !sc_read) { + close(fd_secure_input[0]); + close(fd_secure_input[1]); + close(fd_secure_output[1]); + close(fd_secure_output[0]); + int status; + waitpid(sc_pid, &status, 0); + + return RADV_SC_TYPE_INIT_FAILURE; + } else { + assert(sc_type == RADV_SC_TYPE_INIT_SUCCESS); + write(device->sc_state->secure_compile_processes[process].fd_secure_output, &sc_type, sizeof(sc_type)); + + close(fd_secure_input[0]); + close(fd_secure_input[1]); + close(fd_secure_output[1]); + close(fd_secure_output[0]); + + int status; + waitpid(sc_pid, &status, 0); + } + } + + return RADV_SC_TYPE_INIT_SUCCESS; +} + +/* Run a bare bones fork of a device that was forked right after its creation. + * This device will have low overhead when it is forked again before each + * pipeline compilation. This device sits idle and its only job is to fork + * itself. 
+ */ +static void run_secure_compile_idle_device(struct radv_device *device, unsigned process, + int fd_secure_input, int fd_secure_output) +{ + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_INIT_SUCCESS; + device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input; + device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output; + + write(fd_secure_output, &sc_type, sizeof(sc_type)); + + while (true) { + radv_sc_read(fd_secure_input, &sc_type, sizeof(sc_type), false); + + if (sc_type == RADV_SC_TYPE_FORK_DEVICE) { + sc_type = fork_secure_compile_device(device, process); + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE) + goto secure_compile_exit; + + } else if (sc_type == RADV_SC_TYPE_DESTROY_DEVICE) { + goto secure_compile_exit; + } + } + +secure_compile_exit: + close(fd_secure_input); + close(fd_secure_output); + _exit(0); +} + +static void destroy_secure_compile_device(struct radv_device *device, unsigned process) +{ + int fd_secure_input = device->sc_state->secure_compile_processes[process].fd_secure_input; + + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_DESTROY_DEVICE; + write(fd_secure_input, &sc_type, sizeof(sc_type)); + + close(device->sc_state->secure_compile_processes[process].fd_secure_input); + close(device->sc_state->secure_compile_processes[process].fd_secure_output); + + int status; + waitpid(device->sc_state->secure_compile_processes[process].sc_pid, &status, 0); +} + +static VkResult fork_secure_compile_idle_device(struct radv_device *device) +{ + device->sc_state = vk_zalloc(&device->alloc, + sizeof(struct radv_secure_compile_state), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + mtx_init(&device->sc_state->secure_compile_mutex, mtx_plain); + + pid_t upid = getpid(); + time_t seconds = time(NULL); + + char *uid; + if (asprintf(&uid, "%ld_%ld", (long) upid, (long) seconds) == -1) + return VK_ERROR_INITIALIZATION_FAILED; + + device->sc_state->uid = uid; + + uint8_t sc_threads = device->instance->num_sc_threads; + int fd_secure_input[MAX_SC_PROCS][2]; + int fd_secure_output[MAX_SC_PROCS][2]; + + /* create pipe descriptors (used to communicate between processes) */ + for (unsigned i = 0; i < sc_threads; i++) { + if (pipe(fd_secure_input[i]) == -1 || + pipe(fd_secure_output[i]) == -1) { + return VK_ERROR_INITIALIZATION_FAILED; + } + } + + device->sc_state->secure_compile_processes = vk_zalloc(&device->alloc, + sizeof(struct radv_secure_compile_process) * sc_threads, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + for (unsigned process = 0; process < sc_threads; process++) { + if ((device->sc_state->secure_compile_processes[process].sc_pid = fork()) == 0) { + device->sc_state->secure_compile_thread_counter = process; + run_secure_compile_idle_device(device, process, fd_secure_input[process][0], fd_secure_output[process][1]); + } else { + if (device->sc_state->secure_compile_processes[process].sc_pid == -1) + return VK_ERROR_INITIALIZATION_FAILED; + + /* Read the init result returned from the secure process */ + enum radv_secure_compile_type sc_type; + bool sc_read = radv_sc_read(fd_secure_output[process][0], &sc_type, sizeof(sc_type), true); + + bool fifo_result; + if (sc_read && sc_type == RADV_SC_TYPE_INIT_SUCCESS) { + fifo_result = secure_compile_open_fifo_fds(device->sc_state, + &device->sc_state->secure_compile_processes[process].fd_server, + &device->sc_state->secure_compile_processes[process].fd_client, + process, true); + + device->sc_state->secure_compile_processes[process].fd_secure_input = 
fd_secure_input[process][1]; + device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output[process][0]; + } + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE || !sc_read || !fifo_result) { + close(fd_secure_input[process][0]); + close(fd_secure_input[process][1]); + close(fd_secure_output[process][1]); + close(fd_secure_output[process][0]); + int status; + waitpid(device->sc_state->secure_compile_processes[process].sc_pid, &status, 0); + + /* Destroy any forks that were created sucessfully */ + for (unsigned i = 0; i < process; i++) { + destroy_secure_compile_device(device, i); + } + + return VK_ERROR_INITIALIZATION_FAILED; + } + } + } + return VK_SUCCESS; +} + +static VkResult +radv_create_pthread_cond(pthread_cond_t *cond) +{ + pthread_condattr_t condattr; + if (pthread_condattr_init(&condattr)) { + return VK_ERROR_INITIALIZATION_FAILED; + } + + if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) { + pthread_condattr_destroy(&condattr); + return VK_ERROR_INITIALIZATION_FAILED; + } + if (pthread_cond_init(cond, &condattr)) { + pthread_condattr_destroy(&condattr); + return VK_ERROR_INITIALIZATION_FAILED; + } + pthread_condattr_destroy(&condattr); + return VK_SUCCESS; +} + VkResult radv_CreateDevice( VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo* pCreateInfo, @@ -1937,14 +2728,11 @@ VkResult radv_CreateDevice( device->pbb_allowed = device->physical_device->rad_info.chip_class >= GFX9 && !(device->instance->debug_flags & RADV_DEBUG_NOBINNING); - /* Disabled and not implemented for now. */ + /* Disable DFSM by default. As of 2019-09-15 Talos on Low is still 3% slower on Raven. */ device->dfsm_allowed = device->pbb_allowed && - (device->physical_device->rad_info.family == CHIP_RAVEN || - device->physical_device->rad_info.family == CHIP_RAVEN2); + (device->instance->perftest_flags & RADV_PERFTEST_DFSM); -#ifdef ANDROID device->always_use_syncobj = device->physical_device->rad_info.has_syncobj_wait_for_submit; -#endif /* The maximum number of scratch waves. Scratch space isn't divided * evenly between CUs. The number is only a function of the number of CUs. @@ -1962,8 +2750,7 @@ VkResult radv_CreateDevice( device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, max_threads_per_block / 64); - device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32); + device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1); if (device->physical_device->rad_info.chip_class >= GFX7) { /* If the KMD allows it (there is a KMD hw register for it), @@ -1976,9 +2763,6 @@ VkResult radv_CreateDevice( device->tess_offchip_block_dw_size = device->physical_device->rad_info.family == CHIP_HAWAII ? 
4096 : 8192; - device->has_distributed_tess = - device->physical_device->rad_info.chip_class >= GFX8 && - device->physical_device->rad_info.max_se >= 2; if (getenv("RADV_TRACE_FILE")) { const char *filename = getenv("RADV_TRACE_FILE"); @@ -1996,8 +2780,12 @@ VkResult radv_CreateDevice( radv_dump_enabled_options(device, stderr); } - device->keep_shader_info = keep_shader_info; + /* Temporarily disable secure compile while we create meta shaders, etc */ + uint8_t sc_threads = device->instance->num_sc_threads; + if (sc_threads) + device->instance->num_sc_threads = 0; + device->keep_shader_info = keep_shader_info; result = radv_device_init_meta(device); if (result != VK_SUCCESS) goto fail; @@ -2037,6 +2825,10 @@ VkResult radv_CreateDevice( device->mem_cache = radv_pipeline_cache_from_handle(pc); + result = radv_create_pthread_cond(&device->timeline_cond); + if (result != VK_SUCCESS) + goto fail_mem_cache; + device->force_aniso = MIN2(16, radv_get_int_debug_option("RADV_TEX_ANISO", -1)); if (device->force_aniso >= 0) { @@ -2044,9 +2836,20 @@ VkResult radv_CreateDevice( 1 << util_logbase2(device->force_aniso)); } + /* Fork device for secure compile as required */ + device->instance->num_sc_threads = sc_threads; + if (radv_device_use_secure_compile(device->instance)) { + + result = fork_secure_compile_idle_device(device); + if (result != VK_SUCCESS) + goto fail_meta; + } + *pDevice = radv_device_to_handle(device); return VK_SUCCESS; +fail_mem_cache: + radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL); fail_meta: radv_device_finish_meta(device); fail: @@ -2099,7 +2902,19 @@ void radv_DestroyDevice( radv_destroy_shader_slabs(device); + pthread_cond_destroy(&device->timeline_cond); radv_bo_list_finish(&device->bo_list); + if (radv_device_use_secure_compile(device->instance)) { + for (unsigned i = 0; i < device->instance->num_sc_threads; i++ ) { + destroy_secure_compile_device(device, i); + } + } + + if (device->sc_state) { + free(device->sc_state->uid); + vk_free(&device->alloc, device->sc_state->secure_compile_processes); + } + vk_free(&device->alloc, device->sc_state); vk_free(&device->alloc, device); } @@ -2477,9 +3292,28 @@ radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_cmdbuf *cs, } } +static void +radv_emit_graphics_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs, + uint32_t size_per_wave, uint32_t waves, + struct radeon_winsys_bo *scratch_bo) +{ + if (queue->queue_family_index != RADV_QUEUE_GENERAL) + return; + + if (!scratch_bo) + return; + + radv_cs_add_buffer(queue->device->ws, cs, scratch_bo); + + radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, + S_0286E8_WAVES(waves) | + S_0286E8_WAVESIZE(round_up_u32(size_per_wave, 1024))); +} + static void radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs, - struct radeon_winsys_bo *compute_scratch_bo) + uint32_t size_per_wave, uint32_t waves, + struct radeon_winsys_bo *compute_scratch_bo) { uint64_t scratch_va; @@ -2494,6 +3328,10 @@ radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs, radeon_emit(cs, scratch_va); radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1)); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(waves) | + S_00B860_WAVESIZE(round_up_u32(size_per_wave, 1024))); } static void @@ -2574,11 +3412,14 @@ radv_init_compute_state(struct radeon_cmdbuf *cs, struct radv_queue *queue) static VkResult radv_get_preamble_cs(struct radv_queue *queue, - uint32_t scratch_size, - 
uint32_t compute_scratch_size, + uint32_t scratch_size_per_wave, + uint32_t scratch_waves, + uint32_t compute_scratch_size_per_wave, + uint32_t compute_scratch_waves, uint32_t esgs_ring_size, uint32_t gsvs_ring_size, bool needs_tess_rings, + bool needs_gds, bool needs_sample_positions, struct radeon_cmdbuf **initial_full_flush_preamble_cs, struct radeon_cmdbuf **initial_preamble_cs, @@ -2590,8 +3431,10 @@ radv_get_preamble_cs(struct radv_queue *queue, struct radeon_winsys_bo *esgs_ring_bo = NULL; struct radeon_winsys_bo *gsvs_ring_bo = NULL; struct radeon_winsys_bo *tess_rings_bo = NULL; + struct radeon_winsys_bo *gds_bo = NULL; + struct radeon_winsys_bo *gds_oa_bo = NULL; struct radeon_cmdbuf *dest_cs[3] = {0}; - bool add_tess_rings = false, add_sample_positions = false; + bool add_tess_rings = false, add_gds = false, add_sample_positions = false; unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; unsigned max_offchip_buffers; unsigned hs_offchip_param = 0; @@ -2601,6 +3444,10 @@ radv_get_preamble_cs(struct radv_queue *queue, if (needs_tess_rings) add_tess_rings = true; } + if (!queue->has_gds) { + if (needs_gds) + add_gds = true; + } if (!queue->has_sample_positions) { if (needs_sample_positions) add_sample_positions = true; @@ -2612,21 +3459,39 @@ radv_get_preamble_cs(struct radv_queue *queue, tess_offchip_ring_size = max_offchip_buffers * queue->device->tess_offchip_block_dw_size * 4; - if (scratch_size <= queue->scratch_size && - compute_scratch_size <= queue->compute_scratch_size && + scratch_size_per_wave = MAX2(scratch_size_per_wave, queue->scratch_size_per_wave); + if (scratch_size_per_wave) + scratch_waves = MIN2(scratch_waves, UINT32_MAX / scratch_size_per_wave); + else + scratch_waves = 0; + + compute_scratch_size_per_wave = MAX2(compute_scratch_size_per_wave, queue->compute_scratch_size_per_wave); + if (compute_scratch_size_per_wave) + compute_scratch_waves = MIN2(compute_scratch_waves, UINT32_MAX / compute_scratch_size_per_wave); + else + compute_scratch_waves = 0; + + if (scratch_size_per_wave <= queue->scratch_size_per_wave && + scratch_waves <= queue->scratch_waves && + compute_scratch_size_per_wave <= queue->compute_scratch_size_per_wave && + compute_scratch_waves <= queue->compute_scratch_waves && esgs_ring_size <= queue->esgs_ring_size && gsvs_ring_size <= queue->gsvs_ring_size && - !add_tess_rings && !add_sample_positions && + !add_tess_rings && !add_gds && !add_sample_positions && queue->initial_preamble_cs) { *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs; *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; - if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) + if (!scratch_size_per_wave && !compute_scratch_size_per_wave && + !esgs_ring_size && !gsvs_ring_size && !needs_tess_rings && + !needs_gds && !needs_sample_positions) *continue_preamble_cs = NULL; return VK_SUCCESS; } - if (scratch_size > queue->scratch_size) { + uint32_t scratch_size = scratch_size_per_wave * scratch_waves; + uint32_t queue_scratch_size = queue->scratch_size_per_wave * queue->scratch_waves; + if (scratch_size > queue_scratch_size) { scratch_bo = queue->device->ws->buffer_create(queue->device->ws, scratch_size, 4096, @@ -2638,7 +3503,9 @@ radv_get_preamble_cs(struct radv_queue *queue, } else scratch_bo = queue->scratch_bo; - if (compute_scratch_size > queue->compute_scratch_size) { + uint32_t compute_scratch_size = compute_scratch_size_per_wave * compute_scratch_waves; + 
uint32_t compute_queue_scratch_size = queue->compute_scratch_size_per_wave * queue->compute_scratch_waves; + if (compute_scratch_size > compute_queue_scratch_size) { compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws, compute_scratch_size, 4096, @@ -2692,6 +3559,32 @@ radv_get_preamble_cs(struct radv_queue *queue, tess_rings_bo = queue->tess_rings_bo; } + if (add_gds) { + assert(queue->device->physical_device->rad_info.chip_class >= GFX10); + + /* 4 streamout GDS counters. + * We need 256B (64 dw) of GDS, otherwise streamout hangs. + */ + gds_bo = queue->device->ws->buffer_create(queue->device->ws, + 256, 4, + RADEON_DOMAIN_GDS, + ring_bo_flags, + RADV_BO_PRIORITY_SCRATCH); + if (!gds_bo) + goto fail; + + gds_oa_bo = queue->device->ws->buffer_create(queue->device->ws, + 4, 1, + RADEON_DOMAIN_OA, + ring_bo_flags, + RADV_BO_PRIORITY_SCRATCH); + if (!gds_oa_bo) + goto fail; + } else { + gds_bo = queue->gds_bo; + gds_oa_bo = queue->gds_oa_bo; + } + if (scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo || gsvs_ring_bo != queue->gsvs_ring_bo || @@ -2780,7 +3673,15 @@ radv_get_preamble_cs(struct radv_queue *queue, radv_emit_tess_factor_ring(queue, cs, hs_offchip_param, tess_factor_ring_size, tess_rings_bo); radv_emit_global_shader_pointers(queue, cs, descriptor_bo); - radv_emit_compute_scratch(queue, cs, compute_scratch_bo); + radv_emit_compute_scratch(queue, cs, compute_scratch_size_per_wave, + compute_scratch_waves, compute_scratch_bo); + radv_emit_graphics_scratch(queue, cs, scratch_size_per_wave, + scratch_waves, scratch_bo); + + if (gds_bo) + radv_cs_add_buffer(queue->device->ws, cs, gds_bo); + if (gds_oa_bo) + radv_cs_add_buffer(queue->device->ws, cs, gds_oa_bo); if (i == 0) { si_cs_emit_cache_flush(cs, @@ -2828,15 +3729,17 @@ radv_get_preamble_cs(struct radv_queue *queue, if (queue->scratch_bo) queue->device->ws->buffer_destroy(queue->scratch_bo); queue->scratch_bo = scratch_bo; - queue->scratch_size = scratch_size; } + queue->scratch_size_per_wave = scratch_size_per_wave; + queue->scratch_waves = scratch_waves; if (compute_scratch_bo != queue->compute_scratch_bo) { if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); queue->compute_scratch_bo = compute_scratch_bo; - queue->compute_scratch_size = compute_scratch_size; } + queue->compute_scratch_size_per_wave = compute_scratch_size_per_wave; + queue->compute_scratch_waves = compute_scratch_waves; if (esgs_ring_bo != queue->esgs_ring_bo) { if (queue->esgs_ring_bo) @@ -2857,6 +3760,14 @@ radv_get_preamble_cs(struct radv_queue *queue, queue->has_tess_rings = true; } + if (gds_bo != queue->gds_bo) { + queue->gds_bo = gds_bo; + queue->has_gds = true; + } + + if (gds_oa_bo != queue->gds_oa_bo) + queue->gds_oa_bo = gds_oa_bo; + if (descriptor_bo != queue->descriptor_bo) { if (queue->descriptor_bo) queue->device->ws->buffer_destroy(queue->descriptor_bo); @@ -2889,15 +3800,21 @@ fail: queue->device->ws->buffer_destroy(gsvs_ring_bo); if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo) queue->device->ws->buffer_destroy(tess_rings_bo); + if (gds_bo && gds_bo != queue->gds_bo) + queue->device->ws->buffer_destroy(gds_bo); + if (gds_oa_bo && gds_oa_bo != queue->gds_oa_bo) + queue->device->ws->buffer_destroy(gds_oa_bo); + return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); } -static VkResult radv_alloc_sem_counts(struct radv_instance *instance, +static VkResult radv_alloc_sem_counts(struct radv_device *device, struct radv_winsys_sem_counts 
*counts, int num_sems, - const VkSemaphore *sems, + struct radv_semaphore_part **sems, + const uint64_t *timeline_values, VkFence _fence, - bool reset_temp) + bool is_signal) { int syncobj_idx = 0, sem_idx = 0; @@ -2905,12 +3822,19 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance, return VK_SUCCESS; for (uint32_t i = 0; i < num_sems; i++) { - RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]); - - if (sem->temp_syncobj || sem->syncobj) + switch(sems[i]->kind) { + case RADV_SEMAPHORE_SYNCOBJ: counts->syncobj_count++; - else + break; + case RADV_SEMAPHORE_WINSYS: counts->sem_count++; + break; + case RADV_SEMAPHORE_NONE: + break; + case RADV_SEMAPHORE_TIMELINE: + counts->syncobj_count++; + break; + } } if (_fence != VK_NULL_HANDLE) { @@ -2922,28 +3846,48 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance, if (counts->syncobj_count) { counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count); if (!counts->syncobj) - return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } if (counts->sem_count) { counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count); if (!counts->sem) { free(counts->syncobj); - return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } } for (uint32_t i = 0; i < num_sems; i++) { - RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]); + switch(sems[i]->kind) { + case RADV_SEMAPHORE_NONE: + unreachable("Empty semaphore"); + break; + case RADV_SEMAPHORE_SYNCOBJ: + counts->syncobj[syncobj_idx++] = sems[i]->syncobj; + break; + case RADV_SEMAPHORE_WINSYS: + counts->sem[sem_idx++] = sems[i]->ws_sem; + break; + case RADV_SEMAPHORE_TIMELINE: { + pthread_mutex_lock(&sems[i]->timeline.mutex); + struct radv_timeline_point *point = NULL; + if (is_signal) { + point = radv_timeline_add_point_locked(device, &sems[i]->timeline, timeline_values[i]); + } else { + point = radv_timeline_find_point_at_least_locked(device, &sems[i]->timeline, timeline_values[i]); + } - if (sem->temp_syncobj) { - counts->syncobj[syncobj_idx++] = sem->temp_syncobj; + pthread_mutex_unlock(&sems[i]->timeline.mutex); + + if (point) { + counts->syncobj[syncobj_idx++] = point->syncobj; + } else { + /* Explicitly remove the semaphore so we might not find + * a point later post-submit. 
*/ + sems[i] = NULL; + } + break; } - else if (sem->syncobj) - counts->syncobj[syncobj_idx++] = sem->syncobj; - else { - assert(sem->sem); - counts->sem[sem_idx++] = sem->sem; } } @@ -2955,6 +3899,9 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance, counts->syncobj[syncobj_idx++] = fence->syncobj; } + assert(syncobj_idx <= counts->syncobj_count); + counts->syncobj_count = syncobj_idx; + return VK_SUCCESS; } @@ -2970,34 +3917,31 @@ radv_free_sem_info(struct radv_winsys_sem_info *sem_info) static void radv_free_temp_syncobjs(struct radv_device *device, int num_sems, - const VkSemaphore *sems) + struct radv_semaphore_part *sems) { for (uint32_t i = 0; i < num_sems; i++) { - RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]); - - if (sem->temp_syncobj) { - device->ws->destroy_syncobj(device->ws, sem->temp_syncobj); - sem->temp_syncobj = 0; - } + radv_destroy_semaphore_part(device, sems + i); } } static VkResult -radv_alloc_sem_info(struct radv_instance *instance, +radv_alloc_sem_info(struct radv_device *device, struct radv_winsys_sem_info *sem_info, int num_wait_sems, - const VkSemaphore *wait_sems, + struct radv_semaphore_part **wait_sems, + const uint64_t *wait_values, int num_signal_sems, - const VkSemaphore *signal_sems, + struct radv_semaphore_part **signal_sems, + const uint64_t *signal_values, VkFence fence) { VkResult ret; memset(sem_info, 0, sizeof(*sem_info)); - ret = radv_alloc_sem_counts(instance, &sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true); + ret = radv_alloc_sem_counts(device, &sem_info->wait, num_wait_sems, wait_sems, wait_values, VK_NULL_HANDLE, false); if (ret) return ret; - ret = radv_alloc_sem_counts(instance, &sem_info->signal, num_signal_sems, signal_sems, fence, false); + ret = radv_alloc_sem_counts(device, &sem_info->signal, num_signal_sems, signal_sems, signal_values, fence, true); if (ret) radv_free_sem_info(sem_info); @@ -3007,116 +3951,398 @@ radv_alloc_sem_info(struct radv_instance *instance, return ret; } -/* Signals fence as soon as all the work currently put on queue is done. 
*/ -static VkResult radv_signal_fence(struct radv_queue *queue, - struct radv_fence *fence) -{ - int ret; - VkResult result; - struct radv_winsys_sem_info sem_info; +static void +radv_finalize_timelines(struct radv_device *device, + uint32_t num_wait_sems, + struct radv_semaphore_part **wait_sems, + const uint64_t *wait_values, + uint32_t num_signal_sems, + struct radv_semaphore_part **signal_sems, + const uint64_t *signal_values, + struct list_head *processing_list) +{ + for (uint32_t i = 0; i < num_wait_sems; ++i) { + if (wait_sems[i] && wait_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) { + pthread_mutex_lock(&wait_sems[i]->timeline.mutex); + struct radv_timeline_point *point = + radv_timeline_find_point_at_least_locked(device, &wait_sems[i]->timeline, wait_values[i]); + if (point) + --point->wait_count; + pthread_mutex_unlock(&wait_sems[i]->timeline.mutex); + } + } + for (uint32_t i = 0; i < num_signal_sems; ++i) { + if (signal_sems[i] && signal_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) { + pthread_mutex_lock(&signal_sems[i]->timeline.mutex); + struct radv_timeline_point *point = + radv_timeline_find_point_at_least_locked(device, &signal_sems[i]->timeline, signal_values[i]); + if (point) { + signal_sems[i]->timeline.highest_submitted = + MAX2(signal_sems[i]->timeline.highest_submitted, point->value); + point->wait_count--; + } + radv_timeline_trigger_waiters_locked(&signal_sems[i]->timeline, processing_list); + pthread_mutex_unlock(&signal_sems[i]->timeline.mutex); + } + } +} - result = radv_alloc_sem_info(queue->device->instance, &sem_info, 0, NULL, 0, NULL, - radv_fence_to_handle(fence)); - if (result != VK_SUCCESS) - return result; +static void +radv_sparse_buffer_bind_memory(struct radv_device *device, + const VkSparseBufferMemoryBindInfo *bind) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, bind->buffer); - ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, NULL, &sem_info, NULL, - false, fence->fence); - radv_free_sem_info(&sem_info); + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct radv_device_memory *mem = NULL; - if (ret) - return vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = radv_device_memory_from_handle(bind->pBinds[i].memory); - return VK_SUCCESS; + device->ws->buffer_virtual_bind(buffer->bo, + bind->pBinds[i].resourceOffset, + bind->pBinds[i].size, + mem ? mem->bo : NULL, + bind->pBinds[i].memoryOffset); + } } -VkResult radv_QueueSubmit( - VkQueue _queue, - uint32_t submitCount, - const VkSubmitInfo* pSubmits, - VkFence _fence) +static void +radv_sparse_image_opaque_bind_memory(struct radv_device *device, + const VkSparseImageOpaqueMemoryBindInfo *bind) { - RADV_FROM_HANDLE(radv_queue, queue, _queue); - RADV_FROM_HANDLE(radv_fence, fence, _fence); - struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; - struct radeon_winsys_ctx *ctx = queue->hw_ctx; - int ret; - uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT; - uint32_t scratch_size = 0; - uint32_t compute_scratch_size = 0; + RADV_FROM_HANDLE(radv_image, image, bind->image); + + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct radv_device_memory *mem = NULL; + + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = radv_device_memory_from_handle(bind->pBinds[i].memory); + + device->ws->buffer_virtual_bind(image->bo, + bind->pBinds[i].resourceOffset, + bind->pBinds[i].size, + mem ? 
mem->bo : NULL, + bind->pBinds[i].memoryOffset); + } +} + +static VkResult +radv_get_preambles(struct radv_queue *queue, + const VkCommandBuffer *cmd_buffers, + uint32_t cmd_buffer_count, + struct radeon_cmdbuf **initial_full_flush_preamble_cs, + struct radeon_cmdbuf **initial_preamble_cs, + struct radeon_cmdbuf **continue_preamble_cs) +{ + uint32_t scratch_size_per_wave = 0, waves_wanted = 0; + uint32_t compute_scratch_size_per_wave = 0, compute_waves_wanted = 0; uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; - struct radeon_cmdbuf *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL; - VkResult result; - bool fence_emitted = false; bool tess_rings_needed = false; + bool gds_needed = false; bool sample_positions_needed = false; - /* Do this first so failing to allocate scratch buffers can't result in - * partially executed submissions. */ - for (uint32_t i = 0; i < submitCount; i++) { - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { - RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, - pSubmits[i].pCommandBuffers[j]); + for (uint32_t j = 0; j < cmd_buffer_count; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + cmd_buffers[j]); + + scratch_size_per_wave = MAX2(scratch_size_per_wave, cmd_buffer->scratch_size_per_wave_needed); + waves_wanted = MAX2(waves_wanted, cmd_buffer->scratch_waves_wanted); + compute_scratch_size_per_wave = MAX2(compute_scratch_size_per_wave, + cmd_buffer->compute_scratch_size_per_wave_needed); + compute_waves_wanted = MAX2(compute_waves_wanted, + cmd_buffer->compute_scratch_waves_wanted); + esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); + gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); + tess_rings_needed |= cmd_buffer->tess_rings_needed; + gds_needed |= cmd_buffer->gds_needed; + sample_positions_needed |= cmd_buffer->sample_positions_needed; + } + + return radv_get_preamble_cs(queue, scratch_size_per_wave, waves_wanted, + compute_scratch_size_per_wave, compute_waves_wanted, + esgs_ring_size, gsvs_ring_size, tess_rings_needed, + gds_needed, sample_positions_needed, + initial_full_flush_preamble_cs, + initial_preamble_cs, continue_preamble_cs); +} - scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed); - compute_scratch_size = MAX2(compute_scratch_size, - cmd_buffer->compute_scratch_size_needed); - esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); - gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); - tess_rings_needed |= cmd_buffer->tess_rings_needed; - sample_positions_needed |= cmd_buffer->sample_positions_needed; +struct radv_deferred_queue_submission { + struct radv_queue *queue; + VkCommandBuffer *cmd_buffers; + uint32_t cmd_buffer_count; + + /* Sparse bindings that happen on a queue. 
*/ + VkSparseBufferMemoryBindInfo *buffer_binds; + uint32_t buffer_bind_count; + VkSparseImageOpaqueMemoryBindInfo *image_opaque_binds; + uint32_t image_opaque_bind_count; + + bool flush_caches; + VkShaderStageFlags wait_dst_stage_mask; + struct radv_semaphore_part **wait_semaphores; + uint32_t wait_semaphore_count; + struct radv_semaphore_part **signal_semaphores; + uint32_t signal_semaphore_count; + VkFence fence; + + uint64_t *wait_values; + uint64_t *signal_values; + + struct radv_semaphore_part *temporary_semaphore_parts; + uint32_t temporary_semaphore_part_count; + + struct list_head queue_pending_list; + uint32_t submission_wait_count; + struct radv_timeline_waiter *wait_nodes; + + struct list_head processing_list; +}; + +struct radv_queue_submission { + const VkCommandBuffer *cmd_buffers; + uint32_t cmd_buffer_count; + + /* Sparse bindings that happen on a queue. */ + const VkSparseBufferMemoryBindInfo *buffer_binds; + uint32_t buffer_bind_count; + const VkSparseImageOpaqueMemoryBindInfo *image_opaque_binds; + uint32_t image_opaque_bind_count; + + bool flush_caches; + VkPipelineStageFlags wait_dst_stage_mask; + const VkSemaphore *wait_semaphores; + uint32_t wait_semaphore_count; + const VkSemaphore *signal_semaphores; + uint32_t signal_semaphore_count; + VkFence fence; + + const uint64_t *wait_values; + uint32_t wait_value_count; + const uint64_t *signal_values; + uint32_t signal_value_count; +}; + +static VkResult +radv_create_deferred_submission(struct radv_queue *queue, + const struct radv_queue_submission *submission, + struct radv_deferred_queue_submission **out) +{ + struct radv_deferred_queue_submission *deferred = NULL; + size_t size = sizeof(struct radv_deferred_queue_submission); + + uint32_t temporary_count = 0; + for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, submission->wait_semaphores[i]); + if (semaphore->temporary.kind != RADV_SEMAPHORE_NONE) + ++temporary_count; + } + + size += submission->cmd_buffer_count * sizeof(VkCommandBuffer); + size += submission->buffer_bind_count * sizeof(VkSparseBufferMemoryBindInfo); + size += submission->image_opaque_bind_count * sizeof(VkSparseImageOpaqueMemoryBindInfo); + size += submission->wait_semaphore_count * sizeof(struct radv_semaphore_part *); + size += temporary_count * sizeof(struct radv_semaphore_part); + size += submission->signal_semaphore_count * sizeof(struct radv_semaphore_part *); + size += submission->wait_value_count * sizeof(uint64_t); + size += submission->signal_value_count * sizeof(uint64_t); + size += submission->wait_semaphore_count * sizeof(struct radv_timeline_waiter); + + deferred = calloc(1, size); + if (!deferred) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + deferred->queue = queue; + + deferred->cmd_buffers = (void*)(deferred + 1); + deferred->cmd_buffer_count = submission->cmd_buffer_count; + memcpy(deferred->cmd_buffers, submission->cmd_buffers, + submission->cmd_buffer_count * sizeof(*deferred->cmd_buffers)); + + deferred->buffer_binds = (void*)(deferred->cmd_buffers + submission->cmd_buffer_count); + deferred->buffer_bind_count = submission->buffer_bind_count; + memcpy(deferred->buffer_binds, submission->buffer_binds, + submission->buffer_bind_count * sizeof(*deferred->buffer_binds)); + + deferred->image_opaque_binds = (void*)(deferred->buffer_binds + submission->buffer_bind_count); + deferred->image_opaque_bind_count = submission->image_opaque_bind_count; + memcpy(deferred->image_opaque_binds, submission->image_opaque_binds, + 
submission->image_opaque_bind_count * sizeof(*deferred->image_opaque_binds)); + + deferred->flush_caches = submission->flush_caches; + deferred->wait_dst_stage_mask = submission->wait_dst_stage_mask; + + deferred->wait_semaphores = (void*)(deferred->image_opaque_binds + deferred->image_opaque_bind_count); + deferred->wait_semaphore_count = submission->wait_semaphore_count; + + deferred->signal_semaphores = (void*)(deferred->wait_semaphores + deferred->wait_semaphore_count); + deferred->signal_semaphore_count = submission->signal_semaphore_count; + + deferred->fence = submission->fence; + + deferred->temporary_semaphore_parts = (void*)(deferred->signal_semaphores + deferred->signal_semaphore_count); + deferred->temporary_semaphore_part_count = temporary_count; + + uint32_t temporary_idx = 0; + for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, submission->wait_semaphores[i]); + if (semaphore->temporary.kind != RADV_SEMAPHORE_NONE) { + deferred->wait_semaphores[i] = &deferred->temporary_semaphore_parts[temporary_idx]; + deferred->temporary_semaphore_parts[temporary_idx] = semaphore->temporary; + semaphore->temporary.kind = RADV_SEMAPHORE_NONE; + ++temporary_idx; + } else + deferred->wait_semaphores[i] = &semaphore->permanent; + } + + for (uint32_t i = 0; i < submission->signal_semaphore_count; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, submission->signal_semaphores[i]); + if (semaphore->temporary.kind != RADV_SEMAPHORE_NONE) { + deferred->signal_semaphores[i] = &semaphore->temporary; + } else { + deferred->signal_semaphores[i] = &semaphore->permanent; } } - result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, - esgs_ring_size, gsvs_ring_size, tess_rings_needed, - sample_positions_needed, &initial_flush_preamble_cs, - &initial_preamble_cs, &continue_preamble_cs); - if (result != VK_SUCCESS) - return result; + deferred->wait_values = (void*)(deferred->temporary_semaphore_parts + temporary_count); + memcpy(deferred->wait_values, submission->wait_values, submission->wait_value_count * sizeof(uint64_t)); + deferred->signal_values = deferred->wait_values + submission->wait_value_count; + memcpy(deferred->signal_values, submission->signal_values, submission->signal_value_count * sizeof(uint64_t)); - for (uint32_t i = 0; i < submitCount; i++) { - struct radeon_cmdbuf **cs_array; - bool do_flush = !i || pSubmits[i].pWaitDstStageMask; - bool can_patch = true; - uint32_t advance; - struct radv_winsys_sem_info sem_info; - - result = radv_alloc_sem_info(queue->device->instance, - &sem_info, - pSubmits[i].waitSemaphoreCount, - pSubmits[i].pWaitSemaphores, - pSubmits[i].signalSemaphoreCount, - pSubmits[i].pSignalSemaphores, - _fence); - if (result != VK_SUCCESS) - return result; + deferred->wait_nodes = (void*)(deferred->signal_values + submission->signal_value_count); + /* This is worst-case. radv_queue_enqueue_submission will fill in further, but this + * ensures the submission is not accidentally triggered early when adding wait timelines.
*/ + deferred->submission_wait_count = 1 + submission->wait_semaphore_count; - if (!pSubmits[i].commandBufferCount) { - if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) { - ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, NULL, - &sem_info, NULL, - false, base_fence); - if (ret) { - radv_loge("failed to submit CS %d\n", i); - abort(); - } - fence_emitted = true; + *out = deferred; + return VK_SUCCESS; +} + +static void +radv_queue_enqueue_submission(struct radv_deferred_queue_submission *submission, + struct list_head *processing_list) +{ + uint32_t wait_cnt = 0; + struct radv_timeline_waiter *waiter = submission->wait_nodes; + for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) { + if (submission->wait_semaphores[i]->kind == RADV_SEMAPHORE_TIMELINE) { + pthread_mutex_lock(&submission->wait_semaphores[i]->timeline.mutex); + if (submission->wait_semaphores[i]->timeline.highest_submitted < submission->wait_values[i]) { + ++wait_cnt; + waiter->value = submission->wait_values[i]; + waiter->submission = submission; + list_addtail(&waiter->list, &submission->wait_semaphores[i]->timeline.waiters); + ++waiter; } - radv_free_sem_info(&sem_info); - continue; + pthread_mutex_unlock(&submission->wait_semaphores[i]->timeline.mutex); + } + } + + pthread_mutex_lock(&submission->queue->pending_mutex); + + bool is_first = list_is_empty(&submission->queue->pending_submissions); + list_addtail(&submission->queue_pending_list, &submission->queue->pending_submissions); + + pthread_mutex_unlock(&submission->queue->pending_mutex); + + /* If there is already a submission in the queue, that will decrement the counter by 1 when + * submitted, but if the queue was empty, we decrement ourselves as there is no previous + * submission. */ + uint32_t decrement = submission->wait_semaphore_count - wait_cnt + (is_first ? 1 : 0); + if (__atomic_sub_fetch(&submission->submission_wait_count, decrement, __ATOMIC_ACQ_REL) == 0) { + list_addtail(&submission->processing_list, processing_list); + } +} + +static void +radv_queue_submission_update_queue(struct radv_deferred_queue_submission *submission, + struct list_head *processing_list) +{ + pthread_mutex_lock(&submission->queue->pending_mutex); + list_del(&submission->queue_pending_list); + + /* trigger the next submission in the queue. */ + if (!list_is_empty(&submission->queue->pending_submissions)) { + struct radv_deferred_queue_submission *next_submission = + list_first_entry(&submission->queue->pending_submissions, + struct radv_deferred_queue_submission, + queue_pending_list); + if (p_atomic_dec_zero(&next_submission->submission_wait_count)) { + list_addtail(&next_submission->processing_list, processing_list); + } + } + pthread_mutex_unlock(&submission->queue->pending_mutex); + + pthread_cond_broadcast(&submission->queue->device->timeline_cond); +} + +static VkResult +radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission, + struct list_head *processing_list) +{ + RADV_FROM_HANDLE(radv_fence, fence, submission->fence); + struct radv_queue *queue = submission->queue; + struct radeon_winsys_ctx *ctx = queue->hw_ctx; + uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT; + struct radeon_winsys_fence *base_fence = fence ? 
fence->fence : NULL; + bool do_flush = submission->flush_caches || submission->wait_dst_stage_mask; + bool can_patch = true; + uint32_t advance; + struct radv_winsys_sem_info sem_info; + VkResult result; + int ret; + struct radeon_cmdbuf *initial_preamble_cs = NULL; + struct radeon_cmdbuf *initial_flush_preamble_cs = NULL; + struct radeon_cmdbuf *continue_preamble_cs = NULL; + + result = radv_get_preambles(queue, submission->cmd_buffers, + submission->cmd_buffer_count, + &initial_preamble_cs, + &initial_flush_preamble_cs, + &continue_preamble_cs); + if (result != VK_SUCCESS) + goto fail; + + result = radv_alloc_sem_info(queue->device, + &sem_info, + submission->wait_semaphore_count, + submission->wait_semaphores, + submission->wait_values, + submission->signal_semaphore_count, + submission->signal_semaphores, + submission->signal_values, + submission->fence); + if (result != VK_SUCCESS) + goto fail; + + for (uint32_t i = 0; i < submission->buffer_bind_count; ++i) { + radv_sparse_buffer_bind_memory(queue->device, + submission->buffer_binds + i); + } + + for (uint32_t i = 0; i < submission->image_opaque_bind_count; ++i) { + radv_sparse_image_opaque_bind_memory(queue->device, + submission->image_opaque_binds + i); + } + + if (!submission->cmd_buffer_count) { + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, + &queue->device->empty_cs[queue->queue_family_index], + 1, NULL, NULL, + &sem_info, NULL, + false, base_fence); + if (ret) { + radv_loge("failed to submit CS\n"); + abort(); } - cs_array = malloc(sizeof(struct radeon_cmdbuf *) * - (pSubmits[i].commandBufferCount)); + goto success; + } else { + struct radeon_cmdbuf **cs_array = malloc(sizeof(struct radeon_cmdbuf *) * + (submission->cmd_buffer_count)); - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { - RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, - pSubmits[i].pCommandBuffers[j]); + for (uint32_t j = 0; j < submission->cmd_buffer_count; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, submission->cmd_buffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); cs_array[j] = cmd_buffer->cs; @@ -3126,18 +4352,18 @@ VkResult radv_QueueSubmit( cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING; } - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) { + for (uint32_t j = 0; j < submission->cmd_buffer_count; j += advance) { struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? 
initial_flush_preamble_cs : initial_preamble_cs; const struct radv_winsys_bo_list *bo_list = NULL; advance = MIN2(max_cs_submission, - pSubmits[i].commandBufferCount - j); + submission->cmd_buffer_count - j); if (queue->device->trace_bo) *queue->device->trace_id_ptr = 0; sem_info.cs_emit_wait = j == 0; - sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount; + sem_info.cs_emit_signal = j + advance == submission->cmd_buffer_count; if (unlikely(queue->device->use_global_bo_list)) { pthread_mutex_lock(&queue->device->bo_list.mutex); @@ -3145,36 +4371,155 @@ VkResult radv_QueueSubmit( } ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, - advance, initial_preamble, continue_preamble_cs, - &sem_info, bo_list, - can_patch, base_fence); + advance, initial_preamble, continue_preamble_cs, + &sem_info, bo_list, + can_patch, base_fence); if (unlikely(queue->device->use_global_bo_list)) pthread_mutex_unlock(&queue->device->bo_list.mutex); if (ret) { - radv_loge("failed to submit CS %d\n", i); + radv_loge("failed to submit CS\n"); abort(); } - fence_emitted = true; if (queue->device->trace_bo) { radv_check_gpu_hangs(queue, cs_array[j]); } } - radv_free_temp_syncobjs(queue->device, - pSubmits[i].waitSemaphoreCount, - pSubmits[i].pWaitSemaphores); - radv_free_sem_info(&sem_info); - free(cs_array); + free(cs_array); + } + +success: + radv_free_temp_syncobjs(queue->device, + submission->temporary_semaphore_part_count, + submission->temporary_semaphore_parts); + radv_finalize_timelines(queue->device, + submission->wait_semaphore_count, + submission->wait_semaphores, + submission->wait_values, + submission->signal_semaphore_count, + submission->signal_semaphores, + submission->signal_values, + processing_list); + /* Has to happen after timeline finalization to make sure the + * condition variable is only triggered when timelines and queue have + * been updated. */ + radv_queue_submission_update_queue(submission, processing_list); + radv_free_sem_info(&sem_info); + free(submission); + return VK_SUCCESS; + +fail: + radv_free_temp_syncobjs(queue->device, + submission->temporary_semaphore_part_count, + submission->temporary_semaphore_parts); + free(submission); + return VK_ERROR_DEVICE_LOST; +} + +static VkResult +radv_process_submissions(struct list_head *processing_list) +{ + while(!list_is_empty(processing_list)) { + struct radv_deferred_queue_submission *submission = + list_first_entry(processing_list, struct radv_deferred_queue_submission, processing_list); + list_del(&submission->processing_list); + + VkResult result = radv_queue_submit_deferred(submission, processing_list); + if (result != VK_SUCCESS) + return result; + } + return VK_SUCCESS; +} + +static VkResult radv_queue_submit(struct radv_queue *queue, + const struct radv_queue_submission *submission) +{ + struct radv_deferred_queue_submission *deferred = NULL; + + VkResult result = radv_create_deferred_submission(queue, submission, &deferred); + if (result != VK_SUCCESS) + return result; + + struct list_head processing_list; + list_inithead(&processing_list); + + radv_queue_enqueue_submission(deferred, &processing_list); + return radv_process_submissions(&processing_list); +} + +/* Signals fence as soon as all the work currently put on queue is done. 
*/ +static VkResult radv_signal_fence(struct radv_queue *queue, + VkFence fence) +{ + return radv_queue_submit(queue, &(struct radv_queue_submission) { + .fence = fence + }); +} + +static bool radv_submit_has_effects(const VkSubmitInfo *info) +{ + return info->commandBufferCount || + info->waitSemaphoreCount || + info->signalSemaphoreCount; +} + +VkResult radv_QueueSubmit( + VkQueue _queue, + uint32_t submitCount, + const VkSubmitInfo* pSubmits, + VkFence fence) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + VkResult result; + uint32_t fence_idx = 0; + bool flushed_caches = false; + + if (fence != VK_NULL_HANDLE) { + for (uint32_t i = 0; i < submitCount; ++i) + if (radv_submit_has_effects(pSubmits + i)) + fence_idx = i; + } else + fence_idx = UINT32_MAX; + + for (uint32_t i = 0; i < submitCount; i++) { + if (!radv_submit_has_effects(pSubmits + i) && fence_idx != i) + continue; + + VkPipelineStageFlags wait_dst_stage_mask = 0; + for (unsigned j = 0; j < pSubmits[i].waitSemaphoreCount; ++j) { + wait_dst_stage_mask |= pSubmits[i].pWaitDstStageMask[j]; + } + + const VkTimelineSemaphoreSubmitInfoKHR *timeline_info = + vk_find_struct_const(pSubmits[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR); + + result = radv_queue_submit(queue, &(struct radv_queue_submission) { + .cmd_buffers = pSubmits[i].pCommandBuffers, + .cmd_buffer_count = pSubmits[i].commandBufferCount, + .wait_dst_stage_mask = wait_dst_stage_mask, + .flush_caches = !flushed_caches, + .wait_semaphores = pSubmits[i].pWaitSemaphores, + .wait_semaphore_count = pSubmits[i].waitSemaphoreCount, + .signal_semaphores = pSubmits[i].pSignalSemaphores, + .signal_semaphore_count = pSubmits[i].signalSemaphoreCount, + .fence = i == fence_idx ? fence : VK_NULL_HANDLE, + .wait_values = timeline_info ? timeline_info->pWaitSemaphoreValues : NULL, + .wait_value_count = timeline_info && timeline_info->pWaitSemaphoreValues ? timeline_info->waitSemaphoreValueCount : 0, + .signal_values = timeline_info ? timeline_info->pSignalSemaphoreValues : NULL, + .signal_value_count = timeline_info && timeline_info->pSignalSemaphoreValues ? timeline_info->signalSemaphoreValueCount : 0, + }); + if (result != VK_SUCCESS) + return result; + + flushed_caches = true; } - if (fence) { - if (!fence_emitted) { - result = radv_signal_fence(queue, fence); - if (result != VK_SUCCESS) - return result; - } + if (fence != VK_NULL_HANDLE && !submitCount) { + result = radv_signal_fence(queue, fence); + if (result != VK_SUCCESS) + return result; } return VK_SUCCESS; @@ -3185,6 +4530,12 @@ VkResult radv_QueueWaitIdle( { RADV_FROM_HANDLE(radv_queue, queue, _queue); + pthread_mutex_lock(&queue->pending_mutex); + while (!list_is_empty(&queue->pending_submissions)) { + pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex); + } + pthread_mutex_unlock(&queue->pending_mutex); + queue->device->ws->ctx_wait_idle(queue->hw_ctx, radv_queue_family_to_ring(queue->queue_family_index), queue->queue_idx); @@ -3247,11 +4598,16 @@ PFN_vkVoidFunction radv_GetInstanceProcAddr( const char* pName) { RADV_FROM_HANDLE(radv_instance, instance, _instance); + bool unchecked = instance ? instance->debug_flags & RADV_DEBUG_ALL_ENTRYPOINTS : false; - return radv_lookup_entrypoint_checked(pName, - instance ? instance->apiVersion : 0, - instance ? &instance->enabled_extensions : NULL, - NULL); + if (unchecked) { + return radv_lookup_entrypoint_unchecked(pName); + } else { + return radv_lookup_entrypoint_checked(pName, + instance ? instance->apiVersion : 0, + instance ? 
&instance->enabled_extensions : NULL, + NULL); + } } /* The loader wants us to expose a second GetInstanceProcAddr function @@ -3292,11 +4648,16 @@ PFN_vkVoidFunction radv_GetDeviceProcAddr( const char* pName) { RADV_FROM_HANDLE(radv_device, device, _device); + bool unchecked = device ? device->instance->debug_flags & RADV_DEBUG_ALL_ENTRYPOINTS : false; - return radv_lookup_entrypoint_checked(pName, - device->instance->apiVersion, - &device->instance->enabled_extensions, - &device->enabled_extensions); + if (unchecked) { + return radv_lookup_entrypoint_unchecked(pName); + } else { + return radv_lookup_entrypoint_checked(pName, + device->instance->apiVersion, + &device->instance->enabled_extensions, + &device->enabled_extensions); + } } bool radv_get_memory_fd(struct radv_device *device, @@ -3314,6 +4675,28 @@ bool radv_get_memory_fd(struct radv_device *device, pFD); } + +static void radv_free_memory(struct radv_device *device, + const VkAllocationCallbacks* pAllocator, + struct radv_device_memory *mem) +{ + if (mem == NULL) + return; + +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + if (mem->android_hardware_buffer) + AHardwareBuffer_release(mem->android_hardware_buffer); +#endif + + if (mem->bo) { + radv_bo_list_remove(device, mem->bo); + device->ws->buffer_destroy(mem->bo); + mem->bo = NULL; + } + + vk_free2(&device->alloc, pAllocator, mem); +} + static VkResult radv_alloc_memory(struct radv_device *device, const VkMemoryAllocateInfo* pAllocateInfo, const VkAllocationCallbacks* pAllocator, @@ -3327,25 +4710,29 @@ static VkResult radv_alloc_memory(struct radv_device *device, assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); - if (pAllocateInfo->allocationSize == 0) { - /* Apparently, this is allowed */ - *pMem = VK_NULL_HANDLE; - return VK_SUCCESS; - } - const VkImportMemoryFdInfoKHR *import_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); const VkMemoryDedicatedAllocateInfo *dedicate_info = vk_find_struct_const(pAllocateInfo->pNext, MEMORY_DEDICATED_ALLOCATE_INFO); const VkExportMemoryAllocateInfo *export_info = vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); + const struct VkImportAndroidHardwareBufferInfoANDROID *ahb_import_info = + vk_find_struct_const(pAllocateInfo->pNext, + IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID); const VkImportMemoryHostPointerInfoEXT *host_ptr_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_HOST_POINTER_INFO_EXT); const struct wsi_memory_allocate_info *wsi_info = vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA); - mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8, + if (pAllocateInfo->allocationSize == 0 && !ahb_import_info && + !(export_info && (export_info->handleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID))) { + /* Apparently, this is allowed */ + *pMem = VK_NULL_HANDLE; + return VK_SUCCESS; + } + + mem = vk_zalloc2(&device->alloc, pAllocator, sizeof(*mem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (mem == NULL) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -3372,14 +4759,27 @@ static VkResult radv_alloc_memory(struct radv_device *device, (int)(priority_float * RADV_BO_PRIORITY_APPLICATION_MAX)); mem->user_ptr = NULL; + mem->bo = NULL; + +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + mem->android_hardware_buffer = NULL; +#endif - if (import_info) { + if (ahb_import_info) { + result = radv_import_ahb_memory(device, mem, priority, ahb_import_info); + if (result != VK_SUCCESS) + goto fail; 
+ } else if(export_info && (export_info->handleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID)) { + result = radv_create_ahb_memory(device, mem, priority, pAllocateInfo); + if (result != VK_SUCCESS) + goto fail; + } else if (import_info) { assert(import_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || import_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); mem->bo = device->ws->buffer_from_fd(device->ws, import_info->fd, - priority, NULL, NULL); + priority, NULL); if (!mem->bo) { result = VK_ERROR_INVALID_EXTERNAL_HANDLE; goto fail; @@ -3388,7 +4788,7 @@ static VkResult radv_alloc_memory(struct radv_device *device, } } else if (host_ptr_info) { assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT); - assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED); + assert(radv_is_mem_type_gtt_cached(mem_type_index)); mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer, pAllocateInfo->allocationSize, priority); @@ -3400,18 +4800,18 @@ static VkResult radv_alloc_memory(struct radv_device *device, } } else { uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096); - if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE || - mem_type_index == RADV_MEM_TYPE_GTT_CACHED) + if (radv_is_mem_type_gtt_wc(mem_type_index) || + radv_is_mem_type_gtt_cached(mem_type_index)) domain = RADEON_DOMAIN_GTT; else domain = RADEON_DOMAIN_VRAM; - if (mem_type_index == RADV_MEM_TYPE_VRAM) + if (radv_is_mem_type_vram(mem_type_index)) flags |= RADEON_FLAG_NO_CPU_ACCESS; else flags |= RADEON_FLAG_CPU_ACCESS; - if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE) + if (radv_is_mem_type_gtt_wc(mem_type_index)) flags |= RADEON_FLAG_GTT_WC; if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes)) { @@ -3421,6 +4821,11 @@ static VkResult radv_alloc_memory(struct radv_device *device, } } + if (radv_is_mem_type_uncached(mem_type_index)) { + assert(device->physical_device->rad_info.has_l2_uncached); + flags |= RADEON_FLAG_VA_UNCACHED; + } + mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment, domain, flags, priority); @@ -3433,15 +4838,14 @@ static VkResult radv_alloc_memory(struct radv_device *device, result = radv_bo_list_add(device, mem->bo); if (result != VK_SUCCESS) - goto fail_bo; + goto fail; *pMem = radv_device_memory_to_handle(mem); return VK_SUCCESS; -fail_bo: - device->ws->buffer_destroy(mem->bo); fail: + radv_free_memory(device, pAllocator,mem); vk_free2(&device->alloc, pAllocator, mem); return result; @@ -3465,14 +4869,7 @@ void radv_FreeMemory( RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_device_memory, mem, _mem); - if (mem == NULL) - return; - - radv_bo_list_remove(device, mem->bo); - device->ws->buffer_destroy(mem->bo); - mem->bo = NULL; - - vk_free2(&device->alloc, pAllocator, mem); + radv_free_memory(device, pAllocator, mem); } VkResult radv_MapMemory( @@ -3710,107 +5107,63 @@ VkResult radv_BindImageMemory( return radv_BindImageMemory2(device, 1, &info); } - -static void -radv_sparse_buffer_bind_memory(struct radv_device *device, - const VkSparseBufferMemoryBindInfo *bind) -{ - RADV_FROM_HANDLE(radv_buffer, buffer, bind->buffer); - - for (uint32_t i = 0; i < bind->bindCount; ++i) { - struct radv_device_memory *mem = NULL; - - if (bind->pBinds[i].memory != VK_NULL_HANDLE) - mem = radv_device_memory_from_handle(bind->pBinds[i].memory); - - 
device->ws->buffer_virtual_bind(buffer->bo, - bind->pBinds[i].resourceOffset, - bind->pBinds[i].size, - mem ? mem->bo : NULL, - bind->pBinds[i].memoryOffset); - } -} - -static void -radv_sparse_image_opaque_bind_memory(struct radv_device *device, - const VkSparseImageOpaqueMemoryBindInfo *bind) +static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info) { - RADV_FROM_HANDLE(radv_image, image, bind->image); - - for (uint32_t i = 0; i < bind->bindCount; ++i) { - struct radv_device_memory *mem = NULL; - - if (bind->pBinds[i].memory != VK_NULL_HANDLE) - mem = radv_device_memory_from_handle(bind->pBinds[i].memory); - - device->ws->buffer_virtual_bind(image->bo, - bind->pBinds[i].resourceOffset, - bind->pBinds[i].size, - mem ? mem->bo : NULL, - bind->pBinds[i].memoryOffset); - } + return info->bufferBindCount || + info->imageOpaqueBindCount || + info->imageBindCount || + info->waitSemaphoreCount || + info->signalSemaphoreCount; } VkResult radv_QueueBindSparse( VkQueue _queue, uint32_t bindInfoCount, const VkBindSparseInfo* pBindInfo, - VkFence _fence) + VkFence fence) { - RADV_FROM_HANDLE(radv_fence, fence, _fence); RADV_FROM_HANDLE(radv_queue, queue, _queue); - struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; - bool fence_emitted = false; VkResult result; - int ret; + uint32_t fence_idx = 0; + + if (fence != VK_NULL_HANDLE) { + for (uint32_t i = 0; i < bindInfoCount; ++i) + if (radv_sparse_bind_has_effects(pBindInfo + i)) + fence_idx = i; + } else + fence_idx = UINT32_MAX; for (uint32_t i = 0; i < bindInfoCount; ++i) { - struct radv_winsys_sem_info sem_info; - for (uint32_t j = 0; j < pBindInfo[i].bufferBindCount; ++j) { - radv_sparse_buffer_bind_memory(queue->device, - pBindInfo[i].pBufferBinds + j); - } + if (i != fence_idx && !radv_sparse_bind_has_effects(pBindInfo + i)) + continue; - for (uint32_t j = 0; j < pBindInfo[i].imageOpaqueBindCount; ++j) { - radv_sparse_image_opaque_bind_memory(queue->device, - pBindInfo[i].pImageOpaqueBinds + j); - } + const VkTimelineSemaphoreSubmitInfoKHR *timeline_info = + vk_find_struct_const(pBindInfo[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR); + + VkResult result = radv_queue_submit(queue, &(struct radv_queue_submission) { + .buffer_binds = pBindInfo[i].pBufferBinds, + .buffer_bind_count = pBindInfo[i].bufferBindCount, + .image_opaque_binds = pBindInfo[i].pImageOpaqueBinds, + .image_opaque_bind_count = pBindInfo[i].imageOpaqueBindCount, + .wait_semaphores = pBindInfo[i].pWaitSemaphores, + .wait_semaphore_count = pBindInfo[i].waitSemaphoreCount, + .signal_semaphores = pBindInfo[i].pSignalSemaphores, + .signal_semaphore_count = pBindInfo[i].signalSemaphoreCount, + .fence = i == fence_idx ? fence : VK_NULL_HANDLE, + .wait_values = timeline_info ? timeline_info->pWaitSemaphoreValues : NULL, + .wait_value_count = timeline_info && timeline_info->pWaitSemaphoreValues ? timeline_info->waitSemaphoreValueCount : 0, + .signal_values = timeline_info ? timeline_info->pSignalSemaphoreValues : NULL, + .signal_value_count = timeline_info && timeline_info->pSignalSemaphoreValues ? 
timeline_info->signalSemaphoreValueCount : 0, + }); - VkResult result; - result = radv_alloc_sem_info(queue->device->instance, - &sem_info, - pBindInfo[i].waitSemaphoreCount, - pBindInfo[i].pWaitSemaphores, - pBindInfo[i].signalSemaphoreCount, - pBindInfo[i].pSignalSemaphores, - _fence); if (result != VK_SUCCESS) return result; - - if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) { - ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, NULL, - &sem_info, NULL, - false, base_fence); - if (ret) { - radv_loge("failed to submit CS %d\n", i); - abort(); - } - - fence_emitted = true; - } - - radv_free_sem_info(&sem_info); - } - if (fence) { - if (!fence_emitted) { - result = radv_signal_fence(queue, fence); - if (result != VK_SUCCESS) - return result; - } + if (fence != VK_NULL_HANDLE && !bindInfoCount) { + result = radv_signal_fence(queue, fence); + if (result != VK_SUCCESS) + return result; } return VK_SUCCESS; @@ -4089,6 +5442,199 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence) // Queue semaphore functions +static void +radv_create_timeline(struct radv_timeline *timeline, uint64_t value) +{ + timeline->highest_signaled = value; + timeline->highest_submitted = value; + list_inithead(&timeline->points); + list_inithead(&timeline->free_points); + list_inithead(&timeline->waiters); + pthread_mutex_init(&timeline->mutex, NULL); +} + +static void +radv_destroy_timeline(struct radv_device *device, + struct radv_timeline *timeline) +{ + list_for_each_entry_safe(struct radv_timeline_point, point, + &timeline->free_points, list) { + list_del(&point->list); + device->ws->destroy_syncobj(device->ws, point->syncobj); + free(point); + } + list_for_each_entry_safe(struct radv_timeline_point, point, + &timeline->points, list) { + list_del(&point->list); + device->ws->destroy_syncobj(device->ws, point->syncobj); + free(point); + } + pthread_mutex_destroy(&timeline->mutex); +} + +static void +radv_timeline_gc_locked(struct radv_device *device, + struct radv_timeline *timeline) +{ + list_for_each_entry_safe(struct radv_timeline_point, point, + &timeline->points, list) { + if (point->wait_count || point->value > timeline->highest_submitted) + return; + + if (device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, 0)) { + timeline->highest_signaled = point->value; + list_del(&point->list); + list_add(&point->list, &timeline->free_points); + } + } +} + +static struct radv_timeline_point * +radv_timeline_find_point_at_least_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p) +{ + radv_timeline_gc_locked(device, timeline); + + if (p <= timeline->highest_signaled) + return NULL; + + list_for_each_entry(struct radv_timeline_point, point, + &timeline->points, list) { + if (point->value >= p) { + ++point->wait_count; + return point; + } + } + return NULL; +} + +static struct radv_timeline_point * +radv_timeline_add_point_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p) +{ + radv_timeline_gc_locked(device, timeline); + + struct radv_timeline_point *ret = NULL; + struct radv_timeline_point *prev = NULL; + + if (p <= timeline->highest_signaled) + return NULL; + + list_for_each_entry(struct radv_timeline_point, point, + &timeline->points, list) { + if (point->value == p) { + return NULL; + } + + if (point->value < p) + prev = point; + } + + if (list_is_empty(&timeline->free_points)) { + ret = malloc(sizeof(struct 
radv_timeline_point)); + device->ws->create_syncobj(device->ws, &ret->syncobj); + } else { + ret = list_first_entry(&timeline->free_points, struct radv_timeline_point, list); + list_del(&ret->list); + + device->ws->reset_syncobj(device->ws, ret->syncobj); + } + + ret->value = p; + ret->wait_count = 1; + + if (prev) { + list_add(&ret->list, &prev->list); + } else { + list_addtail(&ret->list, &timeline->points); + } + return ret; +} + + +static VkResult +radv_timeline_wait_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t value, + uint64_t abs_timeout) +{ + while(timeline->highest_submitted < value) { + struct timespec abstime; + timespec_from_nsec(&abstime, abs_timeout); + + pthread_cond_timedwait(&device->timeline_cond, &timeline->mutex, &abstime); + + if (radv_get_current_time() >= abs_timeout && timeline->highest_submitted < value) + return VK_TIMEOUT; + } + + struct radv_timeline_point *point = radv_timeline_find_point_at_least_locked(device, timeline, value); + if (!point) + return VK_SUCCESS; + + point->wait_count++; + + pthread_mutex_unlock(&timeline->mutex); + + bool success = device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, abs_timeout); + + pthread_mutex_lock(&timeline->mutex); + point->wait_count--; + return success ? VK_SUCCESS : VK_TIMEOUT; +} + +static void +radv_timeline_trigger_waiters_locked(struct radv_timeline *timeline, + struct list_head *processing_list) +{ + list_for_each_entry_safe(struct radv_timeline_waiter, waiter, + &timeline->waiters, list) { + if (waiter->value > timeline->highest_submitted) + continue; + + if (p_atomic_dec_zero(&waiter->submission->submission_wait_count)) { + list_addtail(&waiter->submission->processing_list, processing_list); + } + list_del(&waiter->list); + } +} + +static +void radv_destroy_semaphore_part(struct radv_device *device, + struct radv_semaphore_part *part) +{ + switch(part->kind) { + case RADV_SEMAPHORE_NONE: + break; + case RADV_SEMAPHORE_WINSYS: + device->ws->destroy_sem(part->ws_sem); + break; + case RADV_SEMAPHORE_TIMELINE: + radv_destroy_timeline(device, &part->timeline); + break; + case RADV_SEMAPHORE_SYNCOBJ: + device->ws->destroy_syncobj(device->ws, part->syncobj); + break; + } + part->kind = RADV_SEMAPHORE_NONE; +} + +static VkSemaphoreTypeKHR +radv_get_semaphore_type(const void *pNext, uint64_t *initial_value) +{ + const VkSemaphoreTypeCreateInfoKHR *type_info = + vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR); + + if (!type_info) + return VK_SEMAPHORE_TYPE_BINARY_KHR; + + if (initial_value) + *initial_value = type_info->initialValue; + return type_info->semaphoreType; +} + VkResult radv_CreateSemaphore( VkDevice _device, const VkSemaphoreCreateInfo* pCreateInfo, @@ -4100,6 +5646,8 @@ VkResult radv_CreateSemaphore( vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO); VkExternalSemaphoreHandleTypeFlags handleTypes = export ? 
export->handleTypes : 0; + uint64_t initial_value = 0; + VkSemaphoreTypeKHR type = radv_get_semaphore_type(pCreateInfo->pNext, &initial_value); struct radv_semaphore *sem = vk_alloc2(&device->alloc, pAllocator, sizeof(*sem), 8, @@ -4107,23 +5655,27 @@ VkResult radv_CreateSemaphore( if (!sem) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - sem->temp_syncobj = 0; - /* create a syncobject if we are going to export this semaphore */ - if (device->always_use_syncobj || handleTypes) { + sem->temporary.kind = RADV_SEMAPHORE_NONE; + sem->permanent.kind = RADV_SEMAPHORE_NONE; + + if (type == VK_SEMAPHORE_TYPE_TIMELINE_KHR) { + radv_create_timeline(&sem->permanent.timeline, initial_value); + sem->permanent.kind = RADV_SEMAPHORE_TIMELINE; + } else if (device->always_use_syncobj || handleTypes) { assert (device->physical_device->rad_info.has_syncobj); - int ret = device->ws->create_syncobj(device->ws, &sem->syncobj); + int ret = device->ws->create_syncobj(device->ws, &sem->permanent.syncobj); if (ret) { vk_free2(&device->alloc, pAllocator, sem); return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } - sem->sem = NULL; + sem->permanent.kind = RADV_SEMAPHORE_SYNCOBJ; } else { - sem->sem = device->ws->create_sem(device->ws); - if (!sem->sem) { + sem->permanent.ws_sem = device->ws->create_sem(device->ws); + if (!sem->permanent.ws_sem) { vk_free2(&device->alloc, pAllocator, sem); return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } - sem->syncobj = 0; + sem->permanent.kind = RADV_SEMAPHORE_WINSYS; } *pSemaphore = radv_semaphore_to_handle(sem); @@ -4140,13 +5692,115 @@ void radv_DestroySemaphore( if (!_semaphore) return; - if (sem->syncobj) - device->ws->destroy_syncobj(device->ws, sem->syncobj); - else - device->ws->destroy_sem(sem->sem); + radv_destroy_semaphore_part(device, &sem->temporary); + radv_destroy_semaphore_part(device, &sem->permanent); vk_free2(&device->alloc, pAllocator, sem); } +VkResult +radv_GetSemaphoreCounterValueKHR(VkDevice _device, + VkSemaphore _semaphore, + uint64_t* pValue) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore); + + struct radv_semaphore_part *part = + semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? 
&semaphore->temporary : &semaphore->permanent; + + switch (part->kind) { + case RADV_SEMAPHORE_TIMELINE: { + pthread_mutex_lock(&part->timeline.mutex); + radv_timeline_gc_locked(device, &part->timeline); + *pValue = part->timeline.highest_signaled; + pthread_mutex_unlock(&part->timeline.mutex); + return VK_SUCCESS; + } + case RADV_SEMAPHORE_NONE: + case RADV_SEMAPHORE_SYNCOBJ: + case RADV_SEMAPHORE_WINSYS: + unreachable("Invalid semaphore type"); + } + unreachable("Unhandled semaphore type"); +} + + +static VkResult +radv_wait_timelines(struct radv_device *device, + const VkSemaphoreWaitInfoKHR* pWaitInfo, + uint64_t abs_timeout) +{ + if ((pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR) && pWaitInfo->semaphoreCount > 1) { + for (;;) { + for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]); + pthread_mutex_lock(&semaphore->permanent.timeline.mutex); + VkResult result = radv_timeline_wait_locked(device, &semaphore->permanent.timeline, pWaitInfo->pValues[i], 0); + pthread_mutex_unlock(&semaphore->permanent.timeline.mutex); + + if (result == VK_SUCCESS) + return VK_SUCCESS; + } + if (radv_get_current_time() > abs_timeout) + return VK_TIMEOUT; + } + } + + for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]); + pthread_mutex_lock(&semaphore->permanent.timeline.mutex); + VkResult result = radv_timeline_wait_locked(device, &semaphore->permanent.timeline, pWaitInfo->pValues[i], abs_timeout); + pthread_mutex_unlock(&semaphore->permanent.timeline.mutex); + + if (result != VK_SUCCESS) + return result; + } + return VK_SUCCESS; +} +VkResult +radv_WaitSemaphoresKHR(VkDevice _device, + const VkSemaphoreWaitInfoKHR* pWaitInfo, + uint64_t timeout) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + uint64_t abs_timeout = radv_get_absolute_timeout(timeout); + return radv_wait_timelines(device, pWaitInfo, abs_timeout); +} + +VkResult +radv_SignalSemaphoreKHR(VkDevice _device, + const VkSemaphoreSignalInfoKHR* pSignalInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_semaphore, semaphore, pSignalInfo->semaphore); + + struct radv_semaphore_part *part = + semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? 
&semaphore->temporary : &semaphore->permanent; + + switch(part->kind) { + case RADV_SEMAPHORE_TIMELINE: { + pthread_mutex_lock(&part->timeline.mutex); + radv_timeline_gc_locked(device, &part->timeline); + part->timeline.highest_submitted = MAX2(part->timeline.highest_submitted, pSignalInfo->value); + part->timeline.highest_signaled = MAX2(part->timeline.highest_signaled, pSignalInfo->value); + + struct list_head processing_list; + list_inithead(&processing_list); + radv_timeline_trigger_waiters_locked(&part->timeline, &processing_list); + pthread_mutex_unlock(&part->timeline.mutex); + + return radv_process_submissions(&processing_list); + } + case RADV_SEMAPHORE_NONE: + case RADV_SEMAPHORE_SYNCOBJ: + case RADV_SEMAPHORE_WINSYS: + unreachable("Invalid semaphore type"); + } + return VK_SUCCESS; +} + + + VkResult radv_CreateEvent( VkDevice _device, const VkEventCreateInfo* pCreateInfo, @@ -5241,22 +6895,34 @@ VkResult radv_ImportSemaphoreFdKHR(VkDevice _device, { RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); - uint32_t *syncobj_dst = NULL; + VkResult result; + struct radv_semaphore_part *dst = NULL; if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT) { - syncobj_dst = &sem->temp_syncobj; + dst = &sem->temporary; } else { - syncobj_dst = &sem->syncobj; + dst = &sem->permanent; } + uint32_t syncobj = dst->kind == RADV_SEMAPHORE_SYNCOBJ ? dst->syncobj : 0; + switch(pImportSemaphoreFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - return radv_import_opaque_fd(device, pImportSemaphoreFdInfo->fd, syncobj_dst); + result = radv_import_opaque_fd(device, pImportSemaphoreFdInfo->fd, &syncobj); + break; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: - return radv_import_sync_fd(device, pImportSemaphoreFdInfo->fd, syncobj_dst); + result = radv_import_sync_fd(device, pImportSemaphoreFdInfo->fd, &syncobj); + break; default: unreachable("Unhandled semaphore handle type"); } + + if (result == VK_SUCCESS) { + dst->syncobj = syncobj; + dst->kind = RADV_SEMAPHORE_SYNCOBJ; + } + + return result; } VkResult radv_GetSemaphoreFdKHR(VkDevice _device, @@ -5268,10 +6934,13 @@ VkResult radv_GetSemaphoreFdKHR(VkDevice _device, int ret; uint32_t syncobj_handle; - if (sem->temp_syncobj) - syncobj_handle = sem->temp_syncobj; - else - syncobj_handle = sem->syncobj; + if (sem->temporary.kind != RADV_SEMAPHORE_NONE) { + assert(sem->temporary.kind == RADV_SEMAPHORE_SYNCOBJ); + syncobj_handle = sem->temporary.syncobj; + } else { + assert(sem->permanent.kind == RADV_SEMAPHORE_SYNCOBJ); + syncobj_handle = sem->permanent.syncobj; + } switch(pGetFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: @@ -5280,9 +6949,8 @@ VkResult radv_GetSemaphoreFdKHR(VkDevice _device, case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: ret = device->ws->export_syncobj_to_sync_file(device->ws, syncobj_handle, pFd); if (!ret) { - if (sem->temp_syncobj) { - close (sem->temp_syncobj); - sem->temp_syncobj = 0; + if (sem->temporary.kind != RADV_SEMAPHORE_NONE) { + radv_destroy_semaphore_part(device, &sem->temporary); } else { device->ws->reset_syncobj(device->ws, syncobj_handle); } @@ -5303,11 +6971,17 @@ void radv_GetPhysicalDeviceExternalSemaphoreProperties( VkExternalSemaphoreProperties *pExternalSemaphoreProperties) { RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + VkSemaphoreTypeKHR type = radv_get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL); + + if (type == 
VK_SEMAPHORE_TYPE_TIMELINE_KHR) { + pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; + pExternalSemaphoreProperties->compatibleHandleTypes = 0; + pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; /* Require has_syncobj_wait_for_submit for the syncobj signal ioctl introduced at virtually the same time */ - if (pdevice->rad_info.has_syncobj_wait_for_submit && - (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT || - pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT)) { + } else if (pdevice->rad_info.has_syncobj_wait_for_submit && + (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT || + pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT)) { pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; pExternalSemaphoreProperties->compatibleHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; pExternalSemaphoreProperties->externalSemaphoreFeatures = VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
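The hunks above route VkTimelineSemaphoreSubmitInfoKHR values from radv_QueueSubmit/radv_QueueBindSparse into the deferred submission path and add radv_GetSemaphoreCounterValueKHR, radv_WaitSemaphoresKHR and radv_SignalSemaphoreKHR on top of struct radv_timeline. For orientation only, here is a minimal application-side sketch of the path these hunks serve, not part of this patch, using only standard VK_KHR_timeline_semaphore structures; the device and queue handles are assumed to exist, error checking is omitted, and the KHR entry points are assumed to resolve (on Vulkan 1.2 the non-KHR names apply).

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Create a timeline semaphore, submit an (empty) batch that signals value 1
 * on the GPU, then wait for that value on the host. */
static void example_timeline_submit(VkDevice device, VkQueue queue)
{
	const uint64_t signal_value = 1;

	VkSemaphoreTypeCreateInfoKHR type_info = {
		.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR,
		.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR,
		.initialValue = 0,
	};
	VkSemaphoreCreateInfo sem_create = {
		.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
		.pNext = &type_info,
	};
	VkSemaphore sem;
	vkCreateSemaphore(device, &sem_create, NULL, &sem);

	VkTimelineSemaphoreSubmitInfoKHR timeline_info = {
		.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
		.signalSemaphoreValueCount = 1,
		.pSignalSemaphoreValues = &signal_value,
	};
	VkSubmitInfo submit = {
		.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
		.pNext = &timeline_info,
		.signalSemaphoreCount = 1,
		.pSignalSemaphores = &sem,
	};
	/* No command buffers: this exercises the empty-submit case handled above. */
	vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);

	VkSemaphoreWaitInfoKHR wait_info = {
		.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR,
		.semaphoreCount = 1,
		.pSemaphores = &sem,
		.pValues = &signal_value,
	};
	vkWaitSemaphoresKHR(device, &wait_info, UINT64_MAX);

	vkDestroySemaphore(device, sem, NULL);
}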