X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_device.c;h=8989ec3553fbf78bcf98765612b786bbde53a045;hb=90819abb56f6b1a0cd4946b13b6caf24fb46e500;hp=65727571a3ad6961f2e6e3980832e5b526d671c4;hpb=ab147cba77006cdbaf774d7a627c594be8980209;p=mesa.git

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 65727571a3a..8989ec3553f 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -60,7 +60,7 @@ radv_device_get_cache_uuid(enum radeon_family family, void *uuid)
 	memcpy(uuid, &mesa_timestamp, 4);
 	memcpy((char*)uuid + 4, &llvm_timestamp, 4);
 	memcpy((char*)uuid + 8, &f, 2);
-	snprintf((char*)uuid + 10, VK_UUID_SIZE - 10, "radv");
+	snprintf((char*)uuid + 10, VK_UUID_SIZE - 10, "radv%zd", sizeof(void *));
 
 	return 0;
 }
@@ -101,18 +101,16 @@ radv_get_device_name(enum radeon_family family, char *name, size_t name_len)
 	case CHIP_POLARIS11: chip_string = "AMD RADV POLARIS11"; break;
 	case CHIP_POLARIS12: chip_string = "AMD RADV POLARIS12"; break;
 	case CHIP_STONEY: chip_string = "AMD RADV STONEY"; break;
+	case CHIP_VEGAM: chip_string = "AMD RADV VEGA M"; break;
 	case CHIP_VEGA10: chip_string = "AMD RADV VEGA10"; break;
 	case CHIP_VEGA12: chip_string = "AMD RADV VEGA12"; break;
 	case CHIP_RAVEN: chip_string = "AMD RADV RAVEN"; break;
 	default: chip_string = "AMD RADV unknown"; break;
 	}
 
-	if (HAVE_LLVM > 0) {
-		snprintf(llvm_string, sizeof(llvm_string),
-			 " (LLVM %i.%i.%i)", (HAVE_LLVM >> 8) & 0xff,
-			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
-	}
-
+	snprintf(llvm_string, sizeof(llvm_string),
+		 " (LLVM %i.%i.%i)", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
 	snprintf(name, name_len, "%s%s", chip_string, llvm_string);
 }
 
@@ -143,7 +141,7 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
 		gart_index = device->memory_properties.memoryHeapCount++;
 		device->memory_properties.memoryHeaps[gart_index] = (VkMemoryHeap) {
 			.size = device->rad_info.gart_size,
-			.flags = 0,
+			.flags = device->rad_info.has_dedicated_vram ? 0 : VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
 		};
 	}
 
@@ -160,7 +158,8 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
 		device->mem_type_indices[type_count] = RADV_MEM_TYPE_GTT_WRITE_COMBINE;
 		device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
 			.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+			(device->rad_info.has_dedicated_vram ? 0 : VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT),
 			.heapIndex = gart_index,
 		};
 	}
@@ -178,7 +177,8 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
 		device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) {
 			.propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
 			VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-			VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+			VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
+			(device->rad_info.has_dedicated_vram ? 0 : VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT),
 			.heapIndex = gart_index,
 		};
 	}
@@ -225,25 +225,43 @@ radv_physical_device_init(struct radv_physical_device *device,
 	VkResult result;
 	drmVersionPtr version;
 	int fd;
+	int master_fd = -1;
 
 	fd = open(path, O_RDWR | O_CLOEXEC);
-	if (fd < 0)
-		return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+	if (fd < 0) {
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Could not open device '%s'", path);
+
+		return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
+	}
 
 	version = drmGetVersion(fd);
 	if (!version) {
 		close(fd);
-		return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
+
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Could not get the kernel driver version for device '%s'", path);
+
+		return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
 				 "failed to get version %s: %m", path);
 	}
 
 	if (strcmp(version->name, "amdgpu")) {
 		drmFreeVersion(version);
+		if (master_fd != -1)
+			close(master_fd);
 		close(fd);
+
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Device '%s' is not using the amdgpu kernel driver.", path);
+
 		return VK_ERROR_INCOMPATIBLE_DRIVER;
 	}
 	drmFreeVersion(version);
 
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Found compatible device '%s'.", path);
+
 	device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	device->instance = instance;
 	assert(strlen(path) < ARRAY_SIZE(device->path));
@@ -252,10 +270,28 @@ radv_physical_device_init(struct radv_physical_device *device,
 	device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags,
 					       instance->perftest_flags);
 	if (!device->ws) {
-		result = VK_ERROR_INCOMPATIBLE_DRIVER;
+		result = vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
 		goto fail;
 	}
 
+	if (instance->enabled_extensions.KHR_display) {
+		master_fd = open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC);
+		if (master_fd >= 0) {
+			uint32_t accel_working = 0;
+			struct drm_amdgpu_info request = {
+				.return_pointer = (uintptr_t)&accel_working,
+				.return_size = sizeof(accel_working),
+				.query = AMDGPU_INFO_ACCEL_WORKING
+			};
+
+			if (drmCommandWrite(master_fd, DRM_AMDGPU_INFO, &request, sizeof (struct drm_amdgpu_info)) < 0 || !accel_working) {
+				close(master_fd);
+				master_fd = -1;
+			}
+		}
+	}
+
+	device->master_fd = master_fd;
 	device->local_fd = fd;
 	device->ws->query_info(device->ws, &device->rad_info);
 
@@ -265,7 +301,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 	if (radv_device_get_cache_uuid(device->rad_info.family, device->cache_uuid)) {
 		device->ws->destroy(device->ws);
-		result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+		result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
 				   "cannot generate UUID");
 		goto fail;
 	}
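The first hunk above folds the pointer width into the cache UUID so that 32-bit and 64-bit builds stop sharing one on-disk shader cache. A minimal standalone sketch of the same packing scheme follows; the timestamps and family value are dummies, not the driver's real inputs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VK_UUID_SIZE 16

static void build_cache_uuid(uint32_t mesa_ts, uint32_t llvm_ts,
                             uint16_t family, uint8_t uuid[VK_UUID_SIZE])
{
	memcpy(uuid, &mesa_ts, 4);
	memcpy(uuid + 4, &llvm_ts, 4);
	memcpy(uuid + 8, &family, 2);
	/* Mirrors the hunk: "radv%zd" makes 4- and 8-byte pointer builds
	 * produce distinct UUIDs ("radv4" vs "radv8"). */
	snprintf((char *)uuid + 10, VK_UUID_SIZE - 10,
	         "radv%zd", sizeof(void *));
}

int main(void)
{
	uint8_t uuid[VK_UUID_SIZE];
	build_cache_uuid(0x5b000000u, 0x5a000000u, 42, uuid);
	for (int i = 0; i < VK_UUID_SIZE; i++)
		printf("%02x", uuid[i]);
	printf("\n");
	return 0;
}
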
@@ -275,7 +311,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 		(device->instance->perftest_flags & RADV_PERFTEST_SISCHED ? 0x1 : 0) |
 		(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH ? 0x2 : 0);
 
-	/* The gpu id is already embeded in the uuid so we just pass "radv"
+	/* The gpu id is already embedded in the uuid so we just pass "radv"
 	 * when creating the cache.
 	 */
 	char buf[VK_UUID_SIZE * 2 + 1];
@@ -293,10 +329,11 @@ radv_physical_device_init(struct radv_physical_device *device,
 	    device->rad_info.chip_class >= GFX9) {
 		device->has_rbplus = true;
 		device->rbplus_allowed = device->rad_info.family == CHIP_STONEY ||
-					 device->rad_info.family == CHIP_VEGA12;
+					 device->rad_info.family == CHIP_VEGA12 ||
+					 device->rad_info.family == CHIP_RAVEN;
 	}
 
-	/* The mere presense of CLEAR_STATE in the IB causes random GPU hangs
+	/* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
 	 * on SI.
 	 */
 	device->has_clear_state = device->rad_info.chip_class >= CIK;
@@ -307,19 +344,34 @@ radv_physical_device_init(struct radv_physical_device *device,
 	device->has_scissor_bug = device->rad_info.family == CHIP_VEGA10 ||
 				  device->rad_info.family == CHIP_RAVEN;
 
+	/* Out-of-order primitive rasterization. */
+	device->has_out_of_order_rast = device->rad_info.chip_class >= VI &&
+					device->rad_info.max_se >= 2;
+	device->out_of_order_rast_allowed = device->has_out_of_order_rast &&
+					    !(device->instance->debug_flags & RADV_DEBUG_NO_OUT_OF_ORDER);
+
+	device->dcc_msaa_allowed =
+		(device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);
+
 	radv_physical_device_init_mem_types(device);
 	radv_fill_device_extension_table(device, &device->supported_extensions);
 
 	result = radv_init_wsi(device);
 	if (result != VK_SUCCESS) {
 		device->ws->destroy(device->ws);
+		vk_error(instance, result);
 		goto fail;
 	}
 
+	if ((device->instance->debug_flags & RADV_DEBUG_INFO))
+		ac_print_gpu_info(&device->rad_info);
+
 	return VK_SUCCESS;
 
 fail:
 	close(fd);
+	if (master_fd != -1)
+		close(master_fd);
 	return result;
 }
 
@@ -330,6 +382,8 @@ radv_physical_device_finish(struct radv_physical_device *device)
 	device->ws->destroy(device->ws);
 	disk_cache_destroy(device->disk_cache);
 	close(device->local_fd);
+	if (device->master_fd != -1)
+		close(device->master_fd);
 }
 
 static void *
@@ -376,6 +430,13 @@ static const struct debug_control radv_debug_options[] = {
 	{"syncshaders", RADV_DEBUG_SYNC_SHADERS},
 	{"nosisched", RADV_DEBUG_NO_SISCHED},
 	{"preoptir", RADV_DEBUG_PREOPTIR},
+	{"nodynamicbounds", RADV_DEBUG_NO_DYNAMIC_BOUNDS},
+	{"nooutoforder", RADV_DEBUG_NO_OUT_OF_ORDER},
+	{"info", RADV_DEBUG_INFO},
+	{"errors", RADV_DEBUG_ERRORS},
+	{"startup", RADV_DEBUG_STARTUP},
+	{"checkir", RADV_DEBUG_CHECKIR},
+	{"nothreadllvm", RADV_DEBUG_NOTHREADLLVM},
 	{NULL, 0}
 };
 
@@ -391,6 +452,7 @@ static const struct debug_control radv_perftest_options[] = {
 	{"sisched", RADV_PERFTEST_SISCHED},
 	{"localbos", RADV_PERFTEST_LOCAL_BOS},
 	{"binning", RADV_PERFTEST_BINNING},
+	{"dccmsaa", RADV_PERFTEST_DCC_MSAA},
 	{NULL, 0}
 };
 
@@ -412,10 +474,15 @@ radv_handle_per_app_options(struct radv_instance *instance,
 
 	if (!strcmp(name, "Talos - Linux - 32bit") ||
 	    !strcmp(name, "Talos - Linux - 64bit")) {
-		/* Force enable LLVM sisched for Talos because it looks safe
-		 * and it gives few more FPS.
-		 */
-		instance->perftest_flags |= RADV_PERFTEST_SISCHED;
+		if (!(instance->debug_flags & RADV_DEBUG_NO_SISCHED)) {
+			/* Force enable LLVM sisched for Talos because it looks
+			 * safe and it gives few more FPS.
+			 */
+			instance->perftest_flags |= RADV_PERFTEST_SISCHED;
+		}
+	} else if (!strcmp(name, "DOOM_VFR")) {
+		/* Work around a Doom VFR game bug */
+		instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS;
 	}
 }
 
@@ -444,22 +511,13 @@ VkResult radv_CreateInstance(
 	    pCreateInfo->pApplicationInfo->apiVersion != 0) {
 		client_version = pCreateInfo->pApplicationInfo->apiVersion;
 	} else {
-		client_version = VK_MAKE_VERSION(1, 0, 0);
-	}
-
-	if (VK_MAKE_VERSION(1, 0, 0) > client_version ||
-	    client_version > VK_MAKE_VERSION(1, 1, 0xfff)) {
-		return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
-				 "Client requested version %d.%d.%d",
-				 VK_VERSION_MAJOR(client_version),
-				 VK_VERSION_MINOR(client_version),
-				 VK_VERSION_PATCH(client_version));
+		radv_EnumerateInstanceVersion(&client_version);
 	}
 
 	instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
 	if (!instance)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 
@@ -471,13 +529,23 @@ VkResult radv_CreateInstance(
 	instance->apiVersion = client_version;
 	instance->physicalDeviceCount = -1;
 
+	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
+						   radv_debug_options);
+
+	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
+						      radv_perftest_options);
+
+
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Created an instance");
+
 	for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
 		const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i];
 		int index = radv_get_instance_extension_index(ext_name);
 
 		if (index < 0 || !radv_supported_instance_extensions.extensions[index]) {
 			vk_free2(&default_alloc, pAllocator, instance);
-			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+			return vk_error(instance, VK_ERROR_EXTENSION_NOT_PRESENT);
 		}
 
 		instance->enabled_extensions.extensions[index] = true;
 	}
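RADV_DEBUG and RADV_PERFTEST are comma-separated keyword lists matched against the tables extended above. parse_debug_string() lives in Mesa's shared util code; the following is an independent re-implementation of the same name-to-bitmask idea, with made-up flag values, just to show the shape of the parser.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct debug_control { const char *name; uint64_t flag; };

static uint64_t parse_debug_flags(const char *str,
                                  const struct debug_control *opts)
{
	uint64_t flags = 0;

	if (!str)
		return 0;
	while (*str) {
		/* Length of the next comma-separated token. */
		size_t n = strcspn(str, ",");
		for (const struct debug_control *o = opts; o->name; o++) {
			if (strlen(o->name) == n && !strncmp(str, o->name, n))
				flags |= o->flag;
		}
		str += n;
		if (*str == ',')
			str++;
	}
	return flags;
}

int main(void)
{
	static const struct debug_control opts[] = {
		{ "startup", 1u << 0 },   /* flag values are invented */
		{ "info",    1u << 1 },
		{ "checkir", 1u << 2 },
		{ NULL, 0 },
	};
	uint64_t f = parse_debug_flags(getenv("RADV_DEBUG"), opts);
	printf("debug flags: 0x%llx\n", (unsigned long long)f);
	return 0;
}
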
@@ -486,29 +554,15 @@ VkResult radv_CreateInstance(
 	result = vk_debug_report_instance_init(&instance->debug_report_callbacks);
 	if (result != VK_SUCCESS) {
 		vk_free2(&default_alloc, pAllocator, instance);
-		return vk_error(result);
+		return vk_error(instance, result);
 	}
 
 	_mesa_locale_init();
 
 	VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
 
-	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
-						   radv_debug_options);
-
-	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
-						      radv_perftest_options);
-
 	radv_handle_per_app_options(instance, pCreateInfo->pApplicationInfo);
 
-	if (instance->debug_flags & RADV_DEBUG_NO_SISCHED) {
-		/* Disable sisched when the user requests it, this is mostly
-		 * useful when the driver force-enable sisched for the given
-		 * application.
-		 */
-		instance->perftest_flags &= ~RADV_PERFTEST_SISCHED;
-	}
-
 	*pInstance = radv_instance_to_handle(instance);
 
 	return VK_SUCCESS;
@@ -547,8 +601,12 @@ radv_enumerate_devices(struct radv_instance *instance)
 	instance->physicalDeviceCount = 0;
 
 	max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
+
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Found %d drm nodes", max_devices);
+
 	if (max_devices < 1)
-		return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+		return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
 
 	for (unsigned i = 0; i < (unsigned)max_devices; i++) {
 		if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
@@ -630,6 +688,7 @@ void radv_GetPhysicalDeviceFeatures(
 	VkPhysicalDevice physicalDevice,
 	VkPhysicalDeviceFeatures* pFeatures)
 {
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
 	memset(pFeatures, 0, sizeof(*pFeatures));
 
 	*pFeatures = (VkPhysicalDeviceFeatures) {
@@ -653,7 +712,8 @@ void radv_GetPhysicalDeviceFeatures(
 		.alphaToOne = true,
 		.multiViewport = true,
 		.samplerAnisotropy = true,
-		.textureCompressionETC2 = false,
+		.textureCompressionETC2 = pdevice->rad_info.chip_class >= GFX9 ||
+					  pdevice->rad_info.family == CHIP_STONEY,
 		.textureCompressionASTC_LDR = false,
 		.textureCompressionBC = true,
 		.occlusionQueryPrecise = true,
@@ -685,6 +745,7 @@ void radv_GetPhysicalDeviceFeatures2(
 	VkPhysicalDevice physicalDevice,
 	VkPhysicalDeviceFeatures2KHR *pFeatures)
 {
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
 	vk_foreach_struct(ext, pFeatures->pNext) {
 		switch (ext->sType) {
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR: {
@@ -715,10 +776,11 @@ void radv_GetPhysicalDeviceFeatures2(
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: {
 			VkPhysicalDevice16BitStorageFeatures *features =
 			    (VkPhysicalDevice16BitStorageFeatures*)ext;
-			features->storageBuffer16BitAccess = false;
-			features->uniformAndStorageBuffer16BitAccess = false;
-			features->storagePushConstant16 = false;
-			features->storageInputOutput16 = false;
+			bool enabled = HAVE_LLVM >= 0x0700 && pdevice->rad_info.chip_class >= VI;
+			features->storageBuffer16BitAccess = enabled;
+			features->uniformAndStorageBuffer16BitAccess = enabled;
+			features->storagePushConstant16 = enabled;
+			features->storageInputOutput16 = enabled;
 			break;
 		}
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
@@ -727,6 +789,45 @@ void radv_GetPhysicalDeviceFeatures2(
 			features->samplerYcbcrConversion = false;
 			break;
 		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: {
+			VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features =
+				(VkPhysicalDeviceDescriptorIndexingFeaturesEXT*)ext;
+			features->shaderInputAttachmentArrayDynamicIndexing = true;
+			features->shaderUniformTexelBufferArrayDynamicIndexing = true;
+			features->shaderStorageTexelBufferArrayDynamicIndexing = true;
+			features->shaderUniformBufferArrayNonUniformIndexing = false;
+			features->shaderSampledImageArrayNonUniformIndexing = false;
+			features->shaderStorageBufferArrayNonUniformIndexing = false;
+			features->shaderStorageImageArrayNonUniformIndexing = false;
+			features->shaderInputAttachmentArrayNonUniformIndexing = false;
+			features->shaderUniformTexelBufferArrayNonUniformIndexing = false;
+			features->shaderStorageTexelBufferArrayNonUniformIndexing = false;
+			features->descriptorBindingUniformBufferUpdateAfterBind = true;
+			features->descriptorBindingSampledImageUpdateAfterBind = true;
+			features->descriptorBindingStorageImageUpdateAfterBind = true;
+			features->descriptorBindingStorageBufferUpdateAfterBind = true;
+			features->descriptorBindingUniformTexelBufferUpdateAfterBind = true;
+			features->descriptorBindingStorageTexelBufferUpdateAfterBind = true;
+			features->descriptorBindingUpdateUnusedWhilePending = true;
+			features->descriptorBindingPartiallyBound = true;
+			features->descriptorBindingVariableDescriptorCount = true;
+			features->runtimeDescriptorArray = true;
+			break;
+		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
+			VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
+				(VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext;
+			features->conditionalRendering = true;
+			features->inheritedConditionalRendering = false;
+			break;
+		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
+			VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
+				(VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
+			features->vertexAttributeInstanceRateDivisor = VK_TRUE;
+			features->vertexAttributeInstanceRateZeroDivisor = VK_TRUE;
+			break;
+		}
 		default:
 			break;
 		}
@@ -824,7 +925,7 @@ void radv_GetPhysicalDeviceProperties(
 		.maxViewports = MAX_VIEWPORTS,
 		.maxViewportDimensions = { (1 << 14), (1 << 14) },
 		.viewportBoundsRange = { INT16_MIN, INT16_MAX },
-		.viewportSubPixelBits = 13, /* We take a float? */
+		.viewportSubPixelBits = 8,
 		.minMemoryMapAlignment = 4096, /* A page */
 		.minTexelBufferOffsetAlignment = 1,
 		.minUniformBufferOffsetAlignment = 4,
@@ -932,8 +1033,17 @@ void radv_GetPhysicalDeviceProperties2(
 			(VkPhysicalDeviceSubgroupProperties*)ext;
 			properties->subgroupSize = 64;
 			properties->supportedStages = VK_SHADER_STAGE_ALL;
-			properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT;
-			properties->quadOperationsInAllStages = false;
+			properties->supportedOperations =
+						VK_SUBGROUP_FEATURE_BASIC_BIT |
+						VK_SUBGROUP_FEATURE_BALLOT_BIT |
+						VK_SUBGROUP_FEATURE_QUAD_BIT |
+						VK_SUBGROUP_FEATURE_VOTE_BIT;
+			if (pdevice->rad_info.chip_class >= VI) {
+				properties->supportedOperations |=
+						VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
+						VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
+			}
+			properties->quadOperationsInAllStages = true;
 			break;
 		}
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: {
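The shader-core properties hunk just below reports CUs per shader array by dividing the chip-wide CU count by the shader-engine and shader-array counts, and caps in-flight wavefronts per SIMD at 8 or 10 depending on the family. A toy calculation with Vega10-like numbers (4 SEs, 64 CUs; illustrative values, not queried from hardware):

#include <stdio.h>

int main(void)
{
	unsigned max_se = 4, max_sh_per_se = 1, num_cu = 64; /* Vega10-like */
	unsigned simd_per_cu = 4, waves_per_simd = 10, wave_size = 64;

	/* num_cu is chip-wide; the Vulkan property is per shader array. */
	unsigned cu_per_array = num_cu / (max_se * max_sh_per_se);
	unsigned max_waves = num_cu * simd_per_cu * waves_per_simd;

	printf("CUs per shader array: %u\n", cu_per_array);
	printf("max waves in flight:  %u (%u threads)\n",
	       max_waves, max_waves * wave_size);
	return 0;
}
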
@@ -954,6 +1064,94 @@ void radv_GetPhysicalDeviceProperties2(
 			properties->filterMinmaxSingleComponentFormats = true;
 			break;
 		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_AMD: {
+			VkPhysicalDeviceShaderCorePropertiesAMD *properties =
+				(VkPhysicalDeviceShaderCorePropertiesAMD *)ext;
+
+			/* Shader engines. */
+			properties->shaderEngineCount =
+				pdevice->rad_info.max_se;
+			properties->shaderArraysPerEngineCount =
+				pdevice->rad_info.max_sh_per_se;
+			properties->computeUnitsPerShaderArray =
+				pdevice->rad_info.num_good_compute_units /
+					(pdevice->rad_info.max_se *
+					 pdevice->rad_info.max_sh_per_se);
+			properties->simdPerComputeUnit = 4;
+			properties->wavefrontsPerSimd =
+				pdevice->rad_info.family == CHIP_TONGA ||
+				pdevice->rad_info.family == CHIP_ICELAND ||
+				pdevice->rad_info.family == CHIP_POLARIS10 ||
+				pdevice->rad_info.family == CHIP_POLARIS11 ||
+				pdevice->rad_info.family == CHIP_POLARIS12 ||
+				pdevice->rad_info.family == CHIP_VEGAM ? 8 : 10;
+			properties->wavefrontSize = 64;
+
+			/* SGPR. */
+			properties->sgprsPerSimd =
+				radv_get_num_physical_sgprs(pdevice);
+			properties->minSgprAllocation =
+				pdevice->rad_info.chip_class >= VI ? 16 : 8;
+			properties->maxSgprAllocation =
+				pdevice->rad_info.family == CHIP_TONGA ||
+				pdevice->rad_info.family == CHIP_ICELAND ? 96 : 104;
+			properties->sgprAllocationGranularity =
+				pdevice->rad_info.chip_class >= VI ? 16 : 8;
+
+			/* VGPR. */
+			properties->vgprsPerSimd = RADV_NUM_PHYSICAL_VGPRS;
+			properties->minVgprAllocation = 4;
+			properties->maxVgprAllocation = 256;
+			properties->vgprAllocationGranularity = 4;
+			break;
+		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
+			VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *properties =
+				(VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
+			properties->maxVertexAttribDivisor = UINT32_MAX;
+			break;
+		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT: {
+			VkPhysicalDeviceDescriptorIndexingPropertiesEXT *properties =
+				(VkPhysicalDeviceDescriptorIndexingPropertiesEXT*)ext;
+			properties->maxUpdateAfterBindDescriptorsInAllPools = UINT32_MAX / 64;
+			properties->shaderUniformBufferArrayNonUniformIndexingNative = false;
+			properties->shaderSampledImageArrayNonUniformIndexingNative = false;
+			properties->shaderStorageBufferArrayNonUniformIndexingNative = false;
+			properties->shaderStorageImageArrayNonUniformIndexingNative = false;
+			properties->shaderInputAttachmentArrayNonUniformIndexingNative = false;
+			properties->robustBufferAccessUpdateAfterBind = false;
+			properties->quadDivergentImplicitLod = false;
+
+			size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
+				(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
+				 32 /* storage buffer, 32 due to potential space wasted on alignment */ +
+				 32 /* sampler, largest when combined with image */ +
+				 64 /* sampled image */ +
+				 64 /* storage image */);
+			properties->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
+			properties->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
+			properties->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
+			properties->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size;
+			properties->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size;
+			properties->maxPerStageDescriptorUpdateAfterBindInputAttachments = max_descriptor_set_size;
+			properties->maxPerStageUpdateAfterBindResources = max_descriptor_set_size;
+			properties->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size;
+			properties->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size;
+			properties->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS;
+			properties->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size;
+			properties->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS;
+			properties->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size;
+			properties->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size;
+			properties->maxDescriptorSetUpdateAfterBindInputAttachments = max_descriptor_set_size;
+			break;
+		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: {
+			VkPhysicalDeviceProtectedMemoryProperties *properties =
+				(VkPhysicalDeviceProtectedMemoryProperties *)ext;
+			properties->protectedNoFault = false;
+			break;
+		}
 		default:
 			break;
 		}
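The update-after-bind limits above come from dividing a 2 GiB descriptor budget (minus the dynamic-buffer reservation) by a worst-case per-resource descriptor footprint. The same arithmetic as a standalone program; the MAX_DYNAMIC_BUFFERS value here is an assumption for the sketch, not copied from the driver headers.

#include <stdint.h>
#include <stdio.h>

#define MAX_DYNAMIC_BUFFERS 32 /* assumed value for this sketch */

int main(void)
{
	uint64_t pool_bytes = (1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS;

	/* Per-resource descriptor sizes in bytes, padded for worst-case
	 * alignment, matching the comments in the hunk above. */
	uint64_t per_set = 32 /* uniform buffer */ +
	                   32 /* storage buffer */ +
	                   32 /* sampler */ +
	                   64 /* sampled image */ +
	                   64 /* storage image */;

	printf("max update-after-bind descriptors: %llu\n",
	       (unsigned long long)(pool_bytes / per_set));
	return 0;
}
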
@@ -1129,7 +1327,7 @@ radv_queue_init(struct radv_device *device, struct radv_queue *queue,
 
 	queue->hw_ctx = device->ws->ctx_create(device->ws, queue->priority);
 	if (!queue->hw_ctx)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	return VK_SUCCESS;
 }
@@ -1161,37 +1359,71 @@ radv_queue_finish(struct radv_queue *queue)
 }
 
 static void
-radv_device_init_gs_info(struct radv_device *device)
+radv_bo_list_init(struct radv_bo_list *bo_list)
 {
-	switch (device->physical_device->rad_info.family) {
-	case CHIP_OLAND:
-	case CHIP_HAINAN:
-	case CHIP_KAVERI:
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_ICELAND:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-		device->gs_table_depth = 16;
-		return;
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-	case CHIP_VERDE:
-	case CHIP_BONAIRE:
-	case CHIP_HAWAII:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_RAVEN:
-		device->gs_table_depth = 32;
+	pthread_mutex_init(&bo_list->mutex, NULL);
+	bo_list->list.count = bo_list->capacity = 0;
+	bo_list->list.bos = NULL;
+}
+
+static void
+radv_bo_list_finish(struct radv_bo_list *bo_list)
+{
+	free(bo_list->list.bos);
+	pthread_mutex_destroy(&bo_list->mutex);
+}
+
+static VkResult radv_bo_list_add(struct radv_device *device,
+				 struct radeon_winsys_bo *bo)
+{
+	struct radv_bo_list *bo_list = &device->bo_list;
+
+	if (unlikely(!device->use_global_bo_list))
+		return VK_SUCCESS;
+
+	pthread_mutex_lock(&bo_list->mutex);
+	if (bo_list->list.count == bo_list->capacity) {
+		unsigned capacity = MAX2(4, bo_list->capacity * 2);
+		void *data = realloc(bo_list->list.bos, capacity * sizeof(struct radeon_winsys_bo*));
+
+		if (!data) {
+			pthread_mutex_unlock(&bo_list->mutex);
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+		}
+
+		bo_list->list.bos = (struct radeon_winsys_bo**)data;
+		bo_list->capacity = capacity;
+	}
+
+	bo_list->list.bos[bo_list->list.count++] = bo;
+	pthread_mutex_unlock(&bo_list->mutex);
+	return VK_SUCCESS;
+}
+
+static void radv_bo_list_remove(struct radv_device *device,
+				struct radeon_winsys_bo *bo)
+{
+	struct radv_bo_list *bo_list = &device->bo_list;
+
+	if (unlikely(!device->use_global_bo_list))
 		return;
-	default:
-		unreachable("unknown GPU");
+
+	pthread_mutex_lock(&bo_list->mutex);
+	for(unsigned i = 0; i < bo_list->list.count; ++i) {
+		if (bo_list->list.bos[i] == bo) {
+			bo_list->list.bos[i] = bo_list->list.bos[bo_list->list.count - 1];
+			--bo_list->list.count;
+			break;
+		}
 	}
+	pthread_mutex_unlock(&bo_list->mutex);
+}
+
+static void
+radv_device_init_gs_info(struct radv_device *device)
+{
+	device->gs_table_depth = ac_get_gs_table_depth(device->physical_device->rad_info.chip_class,
+						       device->physical_device->rad_info.family);
 }
 
 static int radv_get_device_extension_index(const char *name)
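The new global BO list above is just a mutex-protected growable array; removal swaps the victim with the last element, so order is never preserved but the removal itself stays O(1) after the linear search. A self-contained sketch of the same structure, using stand-in types rather than the winsys ones:

#include <pthread.h>
#include <stdlib.h>

struct bo; /* stand-in for struct radeon_winsys_bo */

struct bo_list {
	pthread_mutex_t mutex;
	struct bo **bos;
	unsigned count, capacity;
};

static int bo_list_add(struct bo_list *l, struct bo *bo)
{
	int ret = 0;

	pthread_mutex_lock(&l->mutex);
	if (l->count == l->capacity) {
		/* Doubling growth, starting at 4, as in the hunk above. */
		unsigned cap = l->capacity ? l->capacity * 2 : 4;
		void *p = realloc(l->bos, cap * sizeof(*l->bos));
		if (!p) {
			ret = -1;
			goto out;
		}
		l->bos = p;
		l->capacity = cap;
	}
	l->bos[l->count++] = bo;
out:
	pthread_mutex_unlock(&l->mutex);
	return ret;
}

static void bo_list_remove(struct bo_list *l, struct bo *bo)
{
	pthread_mutex_lock(&l->mutex);
	for (unsigned i = 0; i < l->count; i++) {
		if (l->bos[i] == bo) {
			/* Order does not matter: swap with the last entry. */
			l->bos[i] = l->bos[--l->count];
			break;
		}
	}
	pthread_mutex_unlock(&l->mutex);
}
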
@@ -1224,7 +1456,7 @@ VkResult radv_CreateDevice(
 		unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
 		for (uint32_t i = 0; i < num_features; i++) {
 			if (enabled_feature[i] && !supported_feature[i])
-				return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+				return vk_error(physical_device->instance, VK_ERROR_FEATURE_NOT_PRESENT);
 		}
 	}
 
@@ -1232,7 +1464,7 @@ VkResult radv_CreateDevice(
 			    sizeof(*device), 8,
 			    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 	if (!device)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	device->instance = physical_device->instance;
@@ -1249,7 +1481,7 @@ VkResult radv_CreateDevice(
 		int index = radv_get_device_extension_index(ext_name);
 		if (index < 0 || !physical_device->supported_extensions.extensions[index]) {
 			vk_free(&device->alloc, device);
-			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+			return vk_error(physical_device->instance, VK_ERROR_EXTENSION_NOT_PRESENT);
 		}
 
 		device->enabled_extensions.extensions[index] = true;
 	}
 
 	keep_shader_info = device->enabled_extensions.AMD_shader_info;
 
+	/* With update after bind we can't attach bo's to the command buffer
+	 * from the descriptor set anymore, so we have to use a global BO list.
+	 */
+	device->use_global_bo_list =
+		device->enabled_extensions.EXT_descriptor_indexing;
+
 	mtx_init(&device->shader_slab_mutex, mtx_plain);
 	list_inithead(&device->shader_slabs);
 
+	radv_bo_list_init(&device->bo_list);
+
 	for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
 		const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
 		uint32_t qfi = queue_create->queueFamilyIndex;
@@ -1289,22 +1529,22 @@ VkResult radv_CreateDevice(
 	}
 
 	device->pbb_allowed = device->physical_device->rad_info.chip_class >= GFX9 &&
-			      (device->instance->perftest_flags & RADV_PERFTEST_BINNING);
+			      ((device->instance->perftest_flags & RADV_PERFTEST_BINNING) ||
+			       device->physical_device->rad_info.family == CHIP_RAVEN);
 
 	/* Disabled and not implemented for now. */
-	device->dfsm_allowed = device->pbb_allowed && false;
+	device->dfsm_allowed = device->pbb_allowed &&
+			       device->physical_device->rad_info.family == CHIP_RAVEN;
 
 #ifdef ANDROID
 	device->always_use_syncobj = device->physical_device->rad_info.has_syncobj_wait_for_submit;
 #endif
 
-	device->llvm_supports_spill = true;
-
 	/* The maximum number of scratch waves. Scratch space isn't divided
 	 * evenly between CUs. The number is only a function of the number of CUs.
 	 * We can decrease the constant to decrease the scratch buffer size.
 	 *
-	 * sctx->scratch_waves must be >= the maximum posible size of
+	 * sctx->scratch_waves must be >= the maximum possible size of
 	 * 1 threadgroup, so that the hw doesn't hang from being unable
 	 * to start any.
 	 *
@@ -1341,6 +1581,10 @@ VkResult radv_CreateDevice(
 		if (!radv_init_trace(device))
 			goto fail;
 
+		fprintf(stderr, "*****************************************************************************\n");
+		fprintf(stderr, "* WARNING: RADV_TRACE_FILE is costly and should only be used for debugging! *\n");
+		fprintf(stderr, "*****************************************************************************\n");
+
 		fprintf(stderr, "Trace file will be dumped to %s\n", filename);
 		radv_dump_enabled_options(device, stderr);
 	}
@@ -1392,6 +1636,8 @@ VkResult radv_CreateDevice(
 fail_meta:
 	radv_device_finish_meta(device);
 fail:
+	radv_bo_list_finish(&device->bo_list);
+
 	if (device->trace_bo)
 		device->ws->buffer_destroy(device->trace_bo);
 
@@ -1439,6 +1685,7 @@ void radv_DestroyDevice(
 
 	radv_destroy_shader_slabs(device);
 
+	radv_bo_list_finish(&device->bo_list);
 	vk_free(&device->alloc, device);
 }
 
@@ -1452,7 +1699,7 @@ VkResult radv_EnumerateInstanceLayerProperties(
 	}
 
 	/* None supported at this time */
-	return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+	return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
 }
 
 VkResult radv_EnumerateDeviceLayerProperties(
@@ -1466,7 +1713,7 @@ VkResult radv_EnumerateDeviceLayerProperties(
 	}
 
 	/* None supported at this time */
-	return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+	return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
 }
 
 void radv_GetDeviceQueue2(
@@ -1661,10 +1908,30 @@ radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buff
 				device->physical_device->rad_info.family != CHIP_CARRIZO &&
 				device->physical_device->rad_info.family != CHIP_STONEY;
 	unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
-	unsigned max_offchip_buffers = max_offchip_buffers_per_se *
-		device->physical_device->rad_info.max_se;
+	unsigned max_offchip_buffers;
 	unsigned offchip_granularity;
 	unsigned hs_offchip_param;
+
+	/*
+	 * Per RadeonSI:
+	 * This must be one less than the maximum number due to a hw limitation.
+	 * Various hardware bugs in SI, CIK, and GFX9 need this.
+	 *
+	 * Per AMDVLK:
+	 * Vega10 should limit max_offchip_buffers to 508 (4 * 127).
+	 * Gfx7 should limit max_offchip_buffers to 508
+	 * Gfx6 should limit max_offchip_buffers to 126 (2 * 63)
+	 *
+	 * Follow AMDVLK here.
+	 */
+	if (device->physical_device->rad_info.family == CHIP_VEGA10 ||
+	    device->physical_device->rad_info.chip_class == CIK ||
+	    device->physical_device->rad_info.chip_class == SI)
+		--max_offchip_buffers_per_se;
+
+	max_offchip_buffers = max_offchip_buffers_per_se *
+		device->physical_device->rad_info.max_se;
+
 	switch (device->tess_offchip_block_dw_size) {
 	default:
 		assert(0);
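The off-chip tessellation buffer count above works out as a per-SE count, reduced by one on the affected parts, times the SE count. A sketch reproducing the AMDVLK limits quoted in the comment (508 for Vega10, 126 for a 2-SE SI part):

#include <stdio.h>

enum chip_class { SI, CIK, VI, GFX9 };

static unsigned max_offchip_buffers(enum chip_class cc, int is_vega10,
                                    int double_buffers, unsigned max_se)
{
	unsigned per_se = double_buffers ? 128 : 64;

	/* Keep one buffer below the architectural maximum on SI/CIK and on
	 * Vega10, matching the hardware-bug workaround above. */
	if (is_vega10 || cc == CIK || cc == SI)
		per_se -= 1;

	return per_se * max_se;
}

int main(void)
{
	printf("Vega10: %u\n", max_offchip_buffers(GFX9, 1, 1, 4)); /* 508 */
	printf("SI:     %u\n", max_offchip_buffers(SI, 0, 0, 2));   /* 126 */
	return 0;
}
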
@@ -1703,6 +1970,153 @@ radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buff
 	return hs_offchip_param;
 }
 
+static void
+radv_emit_gs_ring_sizes(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+			struct radeon_winsys_bo *esgs_ring_bo,
+			uint32_t esgs_ring_size,
+			struct radeon_winsys_bo *gsvs_ring_bo,
+			uint32_t gsvs_ring_size)
+{
+	if (!esgs_ring_bo && !gsvs_ring_bo)
+		return;
+
+	if (esgs_ring_bo)
+		radv_cs_add_buffer(queue->device->ws, cs, esgs_ring_bo);
+
+	if (gsvs_ring_bo)
+		radv_cs_add_buffer(queue->device->ws, cs, gsvs_ring_bo);
+
+	if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+		radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+		radeon_emit(cs, esgs_ring_size >> 8);
+		radeon_emit(cs, gsvs_ring_size >> 8);
+	} else {
+		radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+		radeon_emit(cs, esgs_ring_size >> 8);
+		radeon_emit(cs, gsvs_ring_size >> 8);
+	}
+}
+
+static void
+radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+			   unsigned hs_offchip_param, unsigned tf_ring_size,
+			   struct radeon_winsys_bo *tess_rings_bo)
+{
+	uint64_t tf_va;
+
+	if (!tess_rings_bo)
+		return;
+
+	tf_va = radv_buffer_get_va(tess_rings_bo);
+
+	radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo);
+
+	if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+		radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
+				       S_030938_SIZE(tf_ring_size / 4));
+		radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
+				       tf_va >> 8);
+		if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
+			radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
+					       S_030944_BASE_HI(tf_va >> 40));
+		}
+		radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM,
+				       hs_offchip_param);
+	} else {
+		radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
+				      S_008988_SIZE(tf_ring_size / 4));
+		radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
+				      tf_va >> 8);
+		radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
+				      hs_offchip_param);
+	}
+}
+
+static void
+radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+			  struct radeon_winsys_bo *compute_scratch_bo)
+{
+	uint64_t scratch_va;
+
+	if (!compute_scratch_bo)
+		return;
+
+	scratch_va = radv_buffer_get_va(compute_scratch_bo);
+
+	radv_cs_add_buffer(queue->device->ws, cs, compute_scratch_bo);
+
+	radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+	radeon_emit(cs, scratch_va);
+	radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+			S_008F04_SWIZZLE_ENABLE(1));
+}
+
+static void
+radv_emit_global_shader_pointers(struct radv_queue *queue,
+				 struct radeon_cmdbuf *cs,
+				 struct radeon_winsys_bo *descriptor_bo)
+{
+	uint64_t va;
+
+	if (!descriptor_bo)
+		return;
+
+	va = radv_buffer_get_va(descriptor_bo);
+
+	radv_cs_add_buffer(queue->device->ws, cs, descriptor_bo);
+
+	if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
+		uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+				   R_00B130_SPI_SHADER_USER_DATA_VS_0,
+				   R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS,
+				   R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS};
+
+		for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+			radv_emit_shader_pointer(queue->device, cs, regs[i],
+						 va, true);
+		}
+	} else {
+		uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+				   R_00B130_SPI_SHADER_USER_DATA_VS_0,
+				   R_00B230_SPI_SHADER_USER_DATA_GS_0,
+				   R_00B330_SPI_SHADER_USER_DATA_ES_0,
+				   R_00B430_SPI_SHADER_USER_DATA_HS_0,
+				   R_00B530_SPI_SHADER_USER_DATA_LS_0};
+
+		for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+			radv_emit_shader_pointer(queue->device, cs, regs[i],
						 va, true);
+		}
+	}
+}
+
+static void
+radv_init_graphics_state(struct radeon_cmdbuf *cs, struct radv_queue *queue)
+{
+	struct radv_device *device = queue->device;
+
+	if (device->gfx_init) {
+		uint64_t va = radv_buffer_get_va(device->gfx_init);
+
+		radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		radeon_emit(cs, device->gfx_init_size_dw & 0xffff);
+
+		radv_cs_add_buffer(device->ws, cs, device->gfx_init);
+	} else {
+		struct radv_physical_device *physical_device = device->physical_device;
+		si_emit_graphics(physical_device, cs);
+	}
+}
+
+static void
+radv_init_compute_state(struct radeon_cmdbuf *cs, struct radv_queue *queue)
+{
+	struct radv_physical_device *physical_device = queue->device->physical_device;
+	si_emit_compute(physical_device, cs);
+}
+
 static VkResult
 radv_get_preamble_cs(struct radv_queue *queue,
 		     uint32_t scratch_size,
@@ -1711,9 +2125,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
 		     uint32_t gsvs_ring_size,
 		     bool needs_tess_rings,
 		     bool needs_sample_positions,
-		     struct radeon_winsys_cs **initial_full_flush_preamble_cs,
-		     struct radeon_winsys_cs **initial_preamble_cs,
-		     struct radeon_winsys_cs **continue_preamble_cs)
+		     struct radeon_cmdbuf **initial_full_flush_preamble_cs,
+		     struct radeon_cmdbuf **initial_preamble_cs,
+		     struct radeon_cmdbuf **continue_preamble_cs)
 {
 	struct radeon_winsys_bo *scratch_bo = NULL;
 	struct radeon_winsys_bo *descriptor_bo = NULL;
@@ -1721,7 +2135,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 	struct radeon_winsys_bo *esgs_ring_bo = NULL;
 	struct radeon_winsys_bo *gsvs_ring_bo = NULL;
 	struct radeon_winsys_bo *tess_rings_bo = NULL;
-	struct radeon_winsys_cs *dest_cs[3] = {0};
+	struct radeon_cmdbuf *dest_cs[3] = {0};
 	bool add_tess_rings = false, add_sample_positions = false;
 	unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
 	unsigned max_offchip_buffers;
@@ -1846,7 +2260,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 		descriptor_bo = queue->descriptor_bo;
 
 	for(int i = 0; i < 3; ++i) {
-		struct radeon_winsys_cs *cs = NULL;
+		struct radeon_cmdbuf *cs = NULL;
 		cs = queue->device->ws->cs_create(queue->device->ws,
 						  queue->queue_family_index ? RING_COMPUTE : RING_GFX);
 		if (!cs)
@@ -1855,19 +2269,19 @@ radv_get_preamble_cs(struct radv_queue *queue,
 		dest_cs[i] = cs;
 
 		if (scratch_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, scratch_bo, 8);
-
-		if (esgs_ring_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, esgs_ring_bo, 8);
+			radv_cs_add_buffer(queue->device->ws, cs, scratch_bo);
 
-		if (gsvs_ring_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, gsvs_ring_bo, 8);
-
-		if (tess_rings_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo, 8);
-
-		if (descriptor_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, descriptor_bo, 8);
+		/* Emit initial configuration. */
+		switch (queue->queue_family_index) {
+		case RADV_QUEUE_GENERAL:
+			radv_init_graphics_state(cs, queue);
+			break;
+		case RADV_QUEUE_COMPUTE:
+			radv_init_compute_state(cs, queue);
+			break;
+		case RADV_QUEUE_TRANSFER:
+			break;
+		}
 
 		if (descriptor_bo != queue->descriptor_bo) {
 			uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
@@ -1900,80 +2314,12 @@ radv_get_preamble_cs(struct radv_queue *queue,
 			radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 		}
 
-		if (esgs_ring_bo || gsvs_ring_bo) {
-			if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-				radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
-				radeon_emit(cs, esgs_ring_size >> 8);
-				radeon_emit(cs, gsvs_ring_size >> 8);
-			} else {
-				radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
-				radeon_emit(cs, esgs_ring_size >> 8);
-				radeon_emit(cs, gsvs_ring_size >> 8);
-			}
-		}
-
-		if (tess_rings_bo) {
-			uint64_t tf_va = radv_buffer_get_va(tess_rings_bo);
-			if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-				radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
-						       S_030938_SIZE(tess_factor_ring_size / 4));
-				radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
-						       tf_va >> 8);
-				if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
-					radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
-							       S_030944_BASE_HI(tf_va >> 40));
-				}
-				radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
-			} else {
-				radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
-						      S_008988_SIZE(tess_factor_ring_size / 4));
-				radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
-						      tf_va >> 8);
-				radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-						      hs_offchip_param);
-			}
-		}
-
-		if (descriptor_bo) {
-			uint64_t va = radv_buffer_get_va(descriptor_bo);
-			if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
-				uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-						   R_00B130_SPI_SHADER_USER_DATA_VS_0,
-						   R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS,
-						   R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS};
-
-				for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-					radeon_set_sh_reg_seq(cs, regs[i], 2);
-					radeon_emit(cs, va);
-					radeon_emit(cs, va >> 32);
-				}
-			} else {
-				uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-						   R_00B130_SPI_SHADER_USER_DATA_VS_0,
-						   R_00B230_SPI_SHADER_USER_DATA_GS_0,
-						   R_00B330_SPI_SHADER_USER_DATA_ES_0,
-						   R_00B430_SPI_SHADER_USER_DATA_HS_0,
-						   R_00B530_SPI_SHADER_USER_DATA_LS_0};
-
-				for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-					radeon_set_sh_reg_seq(cs, regs[i], 2);
-					radeon_emit(cs, va);
-					radeon_emit(cs, va >> 32);
-				}
-			}
-		}
-
-		if (compute_scratch_bo) {
-			uint64_t scratch_va = radv_buffer_get_va(compute_scratch_bo);
-			uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-					 S_008F04_SWIZZLE_ENABLE(1);
-
-			radv_cs_add_buffer(queue->device->ws, cs, compute_scratch_bo, 8);
-
-			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
-			radeon_emit(cs, scratch_va);
-			radeon_emit(cs, rsrc1);
-		}
+		radv_emit_gs_ring_sizes(queue, cs, esgs_ring_bo, esgs_ring_size,
+					gsvs_ring_bo, gsvs_ring_size);
+		radv_emit_tess_factor_ring(queue, cs, hs_offchip_param,
					   tess_factor_ring_size, tess_rings_bo);
+		radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
+		radv_emit_compute_scratch(queue, cs, compute_scratch_bo);
 
 		if (i == 0) {
 			si_cs_emit_cache_flush(cs,
@@ -1985,7 +2331,8 @@ radv_get_preamble_cs(struct radv_queue *queue,
 					       RADV_CMD_FLAG_INV_ICACHE |
 					       RADV_CMD_FLAG_INV_SMEM_L1 |
 					       RADV_CMD_FLAG_INV_VMEM_L1 |
-					       RADV_CMD_FLAG_INV_GLOBAL_L2);
+					       RADV_CMD_FLAG_INV_GLOBAL_L2 |
+					       RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
 		} else if (i == 1) {
 			si_cs_emit_cache_flush(cs,
 					       queue->device->physical_device->rad_info.chip_class,
@@ -1995,7 +2342,8 @@ radv_get_preamble_cs(struct radv_queue *queue,
 					       RADV_CMD_FLAG_INV_ICACHE |
 					       RADV_CMD_FLAG_INV_SMEM_L1 |
 					       RADV_CMD_FLAG_INV_VMEM_L1 |
-					       RADV_CMD_FLAG_INV_GLOBAL_L2);
+					       RADV_CMD_FLAG_INV_GLOBAL_L2 |
+					       RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
 		}
 
 		if (!queue->device->ws->cs_finalize(cs))
@@ -2080,10 +2428,11 @@ fail:
 		queue->device->ws->buffer_destroy(gsvs_ring_bo);
 	if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo)
 		queue->device->ws->buffer_destroy(tess_rings_bo);
-	return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+	return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 }
 
-static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
+static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
+				      struct radv_winsys_sem_counts *counts,
 				      int num_sems,
 				      const VkSemaphore *sems,
 				      VkFence _fence,
@@ -2112,14 +2461,14 @@ static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
 	if (counts->syncobj_count) {
 		counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count);
 		if (!counts->syncobj)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}
 
 	if (counts->sem_count) {
 		counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count);
 		if (!counts->sem) {
 			free(counts->syncobj);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 	}
 
@@ -2148,7 +2497,8 @@ static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
 	return VK_SUCCESS;
 }
 
-void radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
+static void
+radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
 {
 	free(sem_info->wait.syncobj);
 	free(sem_info->wait.sem);
@@ -2171,20 +2521,22 @@ static void radv_free_temp_syncobjs(struct radv_device *device,
 	}
 }
 
-VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
-			     int num_wait_sems,
-			     const VkSemaphore *wait_sems,
-			     int num_signal_sems,
-			     const VkSemaphore *signal_sems,
-			     VkFence fence)
+static VkResult
+radv_alloc_sem_info(struct radv_instance *instance,
+		    struct radv_winsys_sem_info *sem_info,
+		    int num_wait_sems,
+		    const VkSemaphore *wait_sems,
+		    int num_signal_sems,
+		    const VkSemaphore *signal_sems,
+		    VkFence fence)
 {
 	VkResult ret;
 	memset(sem_info, 0, sizeof(*sem_info));
 
-	ret = radv_alloc_sem_counts(&sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
+	ret = radv_alloc_sem_counts(instance, &sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
 	if (ret)
 		return ret;
 
-	ret = radv_alloc_sem_counts(&sem_info->signal, num_signal_sems, signal_sems, fence, false);
+	ret = radv_alloc_sem_counts(instance, &sem_info->signal, num_signal_sems, signal_sems, fence, false);
 	if (ret)
 		radv_free_sem_info(sem_info);
 
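radv_alloc_sem_counts() above uses a classic two-pass shape: count each backing type, allocate once per array, then fill. An independent sketch of that shape with simplified stand-in types (not the driver's structures):

#include <stdint.h>
#include <stdlib.h>

struct sem { int is_syncobj; uint32_t syncobj; void *legacy; };

struct sem_counts {
	uint32_t *syncobj; unsigned syncobj_count;
	void **sem;        unsigned sem_count;
};

static int alloc_sem_counts(struct sem_counts *c,
                            struct sem *const *sems, unsigned n)
{
	/* Pass 1: count each backing type. */
	c->syncobj_count = c->sem_count = 0;
	for (unsigned i = 0; i < n; i++) {
		if (sems[i]->is_syncobj)
			c->syncobj_count++;
		else
			c->sem_count++;
	}

	/* One allocation per non-empty array. */
	c->syncobj = c->syncobj_count ?
		malloc(c->syncobj_count * sizeof(*c->syncobj)) : NULL;
	c->sem = c->sem_count ? malloc(c->sem_count * sizeof(*c->sem)) : NULL;
	if ((c->syncobj_count && !c->syncobj) || (c->sem_count && !c->sem)) {
		free(c->syncobj);
		free(c->sem);
		return -1;
	}

	/* Pass 2: fill both arrays. */
	for (unsigned i = 0, s = 0, l = 0; i < n; i++) {
		if (sems[i]->is_syncobj)
			c->syncobj[s++] = sems[i]->syncobj;
		else
			c->sem[l++] = sems[i]->legacy;
	}
	return 0;
}
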
@@ -2202,20 +2554,19 @@ static VkResult radv_signal_fence(struct radv_queue *queue,
 	VkResult result;
 	struct radv_winsys_sem_info sem_info;
 
-	result = radv_alloc_sem_info(&sem_info, 0, NULL, 0, NULL,
+	result = radv_alloc_sem_info(queue->device->instance, &sem_info, 0, NULL, 0, NULL,
 				     radv_fence_to_handle(fence));
 	if (result != VK_SUCCESS)
 		return result;
 
 	ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
 					   &queue->device->empty_cs[queue->queue_family_index],
-					   1, NULL, NULL, &sem_info,
+					   1, NULL, NULL, &sem_info, NULL,
 					   false, fence->fence);
 	radv_free_sem_info(&sem_info);
 
-	/* TODO: find a better error */
 	if (ret)
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST);
 
 	return VK_SUCCESS;
 }
@@ -2235,7 +2586,7 @@ VkResult radv_QueueSubmit(
 	uint32_t scratch_size = 0;
 	uint32_t compute_scratch_size = 0;
 	uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
-	struct radeon_winsys_cs *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
+	struct radeon_cmdbuf *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
 	VkResult result;
 	bool fence_emitted = false;
 	bool tess_rings_needed = false;
@@ -2266,13 +2617,14 @@ VkResult radv_QueueSubmit(
 		return result;
 
 	for (uint32_t i = 0; i < submitCount; i++) {
-		struct radeon_winsys_cs **cs_array;
+		struct radeon_cmdbuf **cs_array;
 		bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
 		bool can_patch = true;
 		uint32_t advance;
 		struct radv_winsys_sem_info sem_info;
 
-		result = radv_alloc_sem_info(&sem_info,
+		result = radv_alloc_sem_info(queue->device->instance,
+					     &sem_info,
 					     pSubmits[i].waitSemaphoreCount,
 					     pSubmits[i].pWaitSemaphores,
 					     pSubmits[i].signalSemaphoreCount,
@@ -2286,7 +2638,7 @@ VkResult radv_QueueSubmit(
 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
 							   &queue->device->empty_cs[queue->queue_family_index],
 							   1, NULL, NULL,
-							   &sem_info,
+							   &sem_info, NULL,
 							   false, base_fence);
 			if (ret) {
 				radv_loge("failed to submit CS %d\n", i);
@@ -2298,7 +2650,7 @@ VkResult radv_QueueSubmit(
 			continue;
 		}
 
-		cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
+		cs_array = malloc(sizeof(struct radeon_cmdbuf *) *
					        (pSubmits[i].commandBufferCount));
 
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
@@ -2314,7 +2666,9 @@ VkResult radv_QueueSubmit(
 		}
 
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
-			struct radeon_winsys_cs *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+			struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+			const struct radv_winsys_bo_list *bo_list = NULL;
+
 			advance = MIN2(max_cs_submission,
 				       pSubmits[i].commandBufferCount - j);
 
@@ -2324,11 +2678,19 @@ VkResult radv_QueueSubmit(
 			sem_info.cs_emit_wait = j == 0;
 			sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
 
+			if (unlikely(queue->device->use_global_bo_list)) {
+				pthread_mutex_lock(&queue->device->bo_list.mutex);
+				bo_list = &queue->device->bo_list.list;
+			}
+
 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
 							   advance, initial_preamble, continue_preamble_cs,
-							   &sem_info,
+							   &sem_info, bo_list,
 							   can_patch, base_fence);
 
+			if (unlikely(queue->device->use_global_bo_list))
+				pthread_mutex_unlock(&queue->device->bo_list.mutex);
+
 			if (ret) {
 				radv_loge("failed to submit CS %d\n", i);
 				abort();
@@ -2348,7 +2710,9 @@ VkResult radv_QueueSubmit(
 
 	if (fence) {
 		if (!fence_emitted) {
-			radv_signal_fence(queue, fence);
+			result = radv_signal_fence(queue, fence);
+			if (result != VK_SUCCESS)
+				return result;
 		}
 		fence->submitted = true;
 	}
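QueueSubmit above windows the command-buffer array so that no single winsys submission exceeds max_cs_submission entries, emitting semaphore waits only with the first window and signals only with the last. The windowing logic in isolation:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Submit `count` command buffers in chunks of at most `max_per_submit`. */
static void submit_batched(unsigned count, unsigned max_per_submit)
{
	for (unsigned j = 0, advance; j < count; j += advance) {
		advance = MIN2(max_per_submit, count - j);
		int emit_wait = (j == 0);
		int emit_signal = (j + advance == count);
		printf("submit cs[%u..%u) wait=%d signal=%d\n",
		       j, j + advance, emit_wait, emit_signal);
	}
}

int main(void)
{
	submit_batched(10, 4);
	return 0;
}
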
@@ -2507,7 +2871,7 @@ static VkResult radv_alloc_memory(struct radv_device *device,
 	mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
 			VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (mem == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	if (wsi_info && wsi_info->implicit_sync)
 		flags |= RADEON_FLAG_IMPLICIT_SYNC;
@@ -2534,11 +2898,8 @@ static VkResult radv_alloc_memory(struct radv_device *device,
 			goto fail;
 		} else {
 			close(import_info->fd);
-			goto out_success;
 		}
-	}
-
-	if (host_ptr_info) {
+	} else if (host_ptr_info) {
 		assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
 		assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED);
 		mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer,
@@ -2548,41 +2909,46 @@ static VkResult radv_alloc_memory(struct radv_device *device,
 			goto fail;
 		} else {
 			mem->user_ptr = host_ptr_info->pHostPointer;
-			goto out_success;
 		}
-	}
-
-	uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
-	if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
-	    mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
-		domain = RADEON_DOMAIN_GTT;
-	else
-		domain = RADEON_DOMAIN_VRAM;
+	} else {
+		uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
+		if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
+		    mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
+			domain = RADEON_DOMAIN_GTT;
+		else
+			domain = RADEON_DOMAIN_VRAM;
 
-	if (mem_type_index == RADV_MEM_TYPE_VRAM)
-		flags |= RADEON_FLAG_NO_CPU_ACCESS;
-	else
-		flags |= RADEON_FLAG_CPU_ACCESS;
+		if (mem_type_index == RADV_MEM_TYPE_VRAM)
+			flags |= RADEON_FLAG_NO_CPU_ACCESS;
+		else
+			flags |= RADEON_FLAG_CPU_ACCESS;
 
-	if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
-		flags |= RADEON_FLAG_GTT_WC;
+		if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
+			flags |= RADEON_FLAG_GTT_WC;
 
-	if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
-		flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+		if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
+			flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
 
-	mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
-					    domain, flags);
+		mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
						    domain, flags);
 
-	if (!mem->bo) {
-		result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-		goto fail;
+		if (!mem->bo) {
+			result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+			goto fail;
+		}
+		mem->type_index = mem_type_index;
 	}
-	mem->type_index = mem_type_index;
-out_success:
+
+	result = radv_bo_list_add(device, mem->bo);
+	if (result != VK_SUCCESS)
+		goto fail_bo;
+
 	*pMem = radv_device_memory_to_handle(mem);
 
 	return VK_SUCCESS;
 
+fail_bo:
+	device->ws->buffer_destroy(mem->bo);
 fail:
 	vk_free2(&device->alloc, pAllocator, mem);
 
@@ -2610,6 +2976,7 @@ void radv_FreeMemory(
 	if (mem == NULL)
 		return;
 
+	radv_bo_list_remove(device, mem->bo);
 	device->ws->buffer_destroy(mem->bo);
 	mem->bo = NULL;
 
@@ -2642,7 +3009,7 @@ VkResult radv_MapMemory(
 		return VK_SUCCESS;
 	}
 
-	return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+	return vk_error(device->instance, VK_ERROR_MEMORY_MAP_FAILED);
 }
 
 void radv_UnmapMemory(
@@ -2902,6 +3269,8 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
 	RADV_FROM_HANDLE(radv_queue, queue, _queue);
 	struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
 	bool fence_emitted = false;
+	VkResult result;
+	int ret;
 
 	for (uint32_t i = 0; i < bindInfoCount; ++i) {
 		struct radv_winsys_sem_info sem_info;
@@ -2916,7 +3285,8 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
 		}
 
 		VkResult result;
-		result = radv_alloc_sem_info(&sem_info,
+		result = radv_alloc_sem_info(queue->device->instance,
+					     &sem_info,
 					     pBindInfo[i].waitSemaphoreCount,
 					     pBindInfo[i].pWaitSemaphores,
 					     pBindInfo[i].signalSemaphoreCount,
@@ -2926,11 +3296,16 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
 			return result;
 
 		if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) {
-			queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
-						     &queue->device->empty_cs[queue->queue_family_index],
-						     1, NULL, NULL,
-						     &sem_info,
-						     false, base_fence);
+			ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
							   &queue->device->empty_cs[queue->queue_family_index],
							   1, NULL, NULL,
							   &sem_info, NULL,
							   false, base_fence);
+			if (ret) {
+				radv_loge("failed to submit CS %d\n", i);
+				abort();
+			}
+
 			fence_emitted = true;
 			if (fence)
 				fence->submitted = true;
@@ -2942,7 +3317,9 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
 
 	if (fence) {
 		if (!fence_emitted) {
-			radv_signal_fence(queue, fence);
+			result = radv_signal_fence(queue, fence);
+			if (result != VK_SUCCESS)
+				return result;
 		}
 		fence->submitted = true;
 	}
@@ -2967,8 +3344,9 @@ VkResult radv_CreateFence(
 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 	if (!fence)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
+	fence->fence_wsi = NULL;
 	fence->submitted = false;
 	fence->signalled = !!(pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT);
 	fence->temp_syncobj = 0;
@@ -2976,7 +3354,7 @@ VkResult radv_CreateFence(
 		int ret = device->ws->create_syncobj(device->ws, &fence->syncobj);
 		if (ret) {
 			vk_free2(&device->alloc, pAllocator, fence);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
 			device->ws->signal_syncobj(device->ws, fence->syncobj);
@@ -2986,7 +3364,7 @@ VkResult radv_CreateFence(
 		fence->fence = device->ws->create_fence();
 		if (!fence->fence) {
 			vk_free2(&device->alloc, pAllocator, fence);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		fence->syncobj = 0;
 	}
@@ -3013,6 +3391,8 @@ void radv_DestroyFence(
 		device->ws->destroy_syncobj(device->ws, fence->syncobj);
 	if (fence->fence)
 		device->ws->destroy_fence(fence->fence);
+	if (fence->fence_wsi)
+		fence->fence_wsi->destroy(fence->fence_wsi);
 	vk_free2(&device->alloc, pAllocator, fence);
 }
 
@@ -3038,7 +3418,19 @@ static bool radv_all_fences_plain_and_submitted(uint32_t fenceCount, const VkFen
 {
 	for (uint32_t i = 0; i < fenceCount; ++i) {
 		RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
-		if (fence->syncobj || fence->temp_syncobj || (!fence->signalled && !fence->submitted))
+		if (fence->fence == NULL || fence->syncobj ||
+		    fence->temp_syncobj ||
+		    (!fence->signalled && !fence->submitted))
+			return false;
+	}
+	return true;
+}
+
+static bool radv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences)
+{
+	for (uint32_t i = 0; i < fenceCount; ++i) {
+		RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
+		if (fence->syncobj == 0 && fence->temp_syncobj == 0)
 			return false;
 	}
 	return true;
}
@@ -3054,10 +3446,12 @@ VkResult radv_WaitForFences(
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	timeout = radv_get_absolute_timeout(timeout);
 
-	if (device->always_use_syncobj) {
+	if (device->always_use_syncobj &&
+	    radv_all_fences_syncobj(fenceCount, pFences))
+	{
 		uint32_t *handles = malloc(sizeof(uint32_t) * fenceCount);
 		if (!handles)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 		for (uint32_t i = 0; i < fenceCount; ++i) {
 			RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
@@ -3076,7 +3470,7 @@ VkResult radv_WaitForFences(
 		uint32_t wait_count = 0;
 		struct radeon_winsys_fence **fences = malloc(sizeof(struct radeon_winsys_fence *) * fenceCount);
 		if (!fences)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 		for (uint32_t i = 0; i < fenceCount; ++i) {
 			RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
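radv_get_absolute_timeout() (not shown in this diff) converts the relative Vulkan timeout into an absolute deadline for loops like the one in the next hunk. A hedged sketch of that conversion, assuming a monotonic-clock implementation; the saturation keeps UINT64_MAX meaning "wait forever":

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t absolute_timeout(uint64_t rel_ns)
{
	struct timespec ts;
	uint64_t now;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	now = (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;

	/* Saturate instead of overflowing. */
	return rel_ns > UINT64_MAX - now ? UINT64_MAX : now + rel_ns;
}

int main(void)
{
	printf("deadline: %llu\n",
	       (unsigned long long)absolute_timeout(5000000));
	return 0;
}
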
@@ -3124,21 +3518,34 @@ VkResult radv_WaitForFences(
 		if (fence->signalled)
 			continue;
 
-		if (!fence->submitted) {
-			while(radv_get_current_time() <= timeout && !fence->submitted)
-				/* Do nothing */;
+		if (fence->fence) {
+			if (!fence->submitted) {
+				while(radv_get_current_time() <= timeout &&
+				      !fence->submitted)
+					/* Do nothing */;
 
-			if (!fence->submitted)
-				return VK_TIMEOUT;
+				if (!fence->submitted)
+					return VK_TIMEOUT;
 
-			/* Recheck as it may have been set by submitting operations. */
-			if (fence->signalled)
-				continue;
+				/* Recheck as it may have been set by
+				 * submitting operations. */
+
+				if (fence->signalled)
+					continue;
+			}
+
+			expired = device->ws->fence_wait(device->ws,
+							 fence->fence,
+							 true, timeout);
+			if (!expired)
+				return VK_TIMEOUT;
 		}
 
-		expired = device->ws->fence_wait(device->ws, fence->fence, true, timeout);
-		if (!expired)
-			return VK_TIMEOUT;
+		if (fence->fence_wsi) {
+			VkResult result = fence->fence_wsi->wait(fence->fence_wsi, timeout);
+			if (result != VK_SUCCESS)
+				return result;
+		}
 
 		fence->signalled = true;
 	}
@@ -3190,9 +3597,19 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence)
 		return VK_SUCCESS;
 	if (!fence->submitted)
 		return VK_NOT_READY;
-	if (!device->ws->fence_wait(device->ws, fence->fence, false, 0))
-		return VK_NOT_READY;
+	if (fence->fence) {
+		if (!device->ws->fence_wait(device->ws, fence->fence, false, 0))
+			return VK_NOT_READY;
+	}
+	if (fence->fence_wsi) {
+		VkResult result = fence->fence_wsi->wait(fence->fence_wsi, 0);
+		if (result != VK_SUCCESS) {
+			if (result == VK_TIMEOUT)
+				return VK_NOT_READY;
+			return result;
+		}
+	}
 	return VK_SUCCESS;
 }
 
@@ -3215,7 +3632,7 @@ VkResult radv_CreateSemaphore(
 			       sizeof(*sem), 8,
 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sem)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	sem->temp_syncobj = 0;
 	/* create a syncobject if we are going to export this semaphore */
@@ -3224,14 +3641,14 @@ VkResult radv_CreateSemaphore(
 		int ret = device->ws->create_syncobj(device->ws, &sem->syncobj);
 		if (ret) {
 			vk_free2(&device->alloc, pAllocator, sem);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		sem->sem = NULL;
 	} else {
 		sem->sem = device->ws->create_sem(device->ws);
 		if (!sem->sem) {
 			vk_free2(&device->alloc, pAllocator, sem);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		sem->syncobj = 0;
 	}
@@ -3269,14 +3686,14 @@ VkResult radv_CreateEvent(
 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 	if (!event)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	event->bo = device->ws->buffer_create(device->ws, 8, 8,
 					      RADEON_DOMAIN_GTT,
 					      RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING);
 	if (!event->bo) {
 		vk_free2(&device->alloc, pAllocator, event);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 
 	event->map = (uint64_t*)device->ws->buffer_map(event->bo);
@@ -3345,7 +3762,7 @@ VkResult radv_CreateBuffer(
 	buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8,
			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (buffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	buffer->size = pCreateInfo->size;
 	buffer->usage = pCreateInfo->usage;
@@ -3362,7 +3779,7 @@ VkResult radv_CreateBuffer(
 						       4096, 0, RADEON_FLAG_VIRTUAL);
 		if (!buffer->bo) {
 			vk_free2(&device->alloc, pAllocator, buffer);
-			return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 		}
 	}
 
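The radv_init_dcc_control_reg() helper introduced below packs four fields into CB_DCC_CONTROL through the generated S_028C78_* macros. A generic illustration of that kind of shift-and-mask field packing; the shift positions and widths here are placeholders, not the real register layout.

#include <stdint.h>
#include <stdio.h>

/* Placeholder field packer; real drivers use generated macros with the
 * documented shifts and masks. */
#define FIELD(v, shift, bits) \
	(((uint32_t)(v) & ((1u << (bits)) - 1)) << (shift))

int main(void)
{
	unsigned max_uncompressed = 2; /* e.g. 256B block */
	unsigned max_compressed   = 2;
	unsigned min_compressed   = 1; /* 64B on APUs, per the comment below */
	unsigned independent_64b  = 0;

	uint32_t reg = FIELD(max_uncompressed, 0, 2) |
	               FIELD(max_compressed,   2, 2) |
	               FIELD(min_compressed,   4, 1) |
	               FIELD(independent_64b,  5, 1);
	printf("CB_DCC_CONTROL = 0x%08x\n", reg);
	return 0;
}
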
@@ -3402,6 +3819,57 @@ static uint32_t radv_surface_max_layer_count(struct radv_image_view *iview)
 	return iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : (iview->base_layer + iview->layer_count);
 }
 
+static uint32_t
+radv_init_dcc_control_reg(struct radv_device *device,
+			  struct radv_image_view *iview)
+{
+	unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
+	unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
+	unsigned max_compressed_block_size;
+	unsigned independent_64b_blocks;
+
+	if (device->physical_device->rad_info.chip_class < VI)
+		return 0;
+
+	if (iview->image->info.samples > 1) {
+		if (iview->image->surface.bpe == 1)
+			max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+		else if (iview->image->surface.bpe == 2)
+			max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
+	}
+
+	if (!device->physical_device->rad_info.has_dedicated_vram) {
+		/* amdvlk: [min-compressed-block-size] should be set to 32 for
+		 * dGPU and 64 for APU because all of our APUs to date use
+		 * DIMMs which have a request granularity size of 64B while all
+		 * other chips have a 32B request size.
+		 */
+		min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
+	}
+
+	if (iview->image->usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+				   VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+				   VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
+		/* If this DCC image is potentially going to be used in texture
+		 * fetches, we need some special settings.
+		 */
+		independent_64b_blocks = 1;
+		max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+	} else {
+		/* MAX_UNCOMPRESSED_BLOCK_SIZE must be >=
+		 * MAX_COMPRESSED_BLOCK_SIZE. Set MAX_COMPRESSED_BLOCK_SIZE as
+		 * big as possible for better compression state.
+		 */
+		independent_64b_blocks = 0;
+		max_compressed_block_size = max_uncompressed_block_size;
+	}
+
+	return S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
+	       S_028C78_MAX_COMPRESSED_BLOCK_SIZE(max_compressed_block_size) |
+	       S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
+	       S_028C78_INDEPENDENT_64B_BLOCKS(independent_64b_blocks);
+}
+
 static void
 radv_initialise_color_surface(struct radv_device *device,
 			      struct radv_color_buffer_info *cb,
@@ -3456,7 +3924,7 @@ radv_initialise_color_surface(struct radv_device *device,
 
 	cb->cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
 
-	if (iview->image->fmask.size) {
+	if (radv_image_has_fmask(iview->image)) {
 		if (device->physical_device->rad_info.chip_class >= CIK)
 			cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(iview->image->fmask.pitch_in_pixels / 8 - 1);
 		cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(iview->image->fmask.tile_mode_index);
@@ -3491,7 +3959,7 @@ radv_initialise_color_surface(struct radv_device *device,
 			S_028C74_NUM_FRAGMENTS(log_samples);
 	}
 
-	if (iview->image->fmask.size) {
+	if (radv_image_has_fmask(iview->image)) {
 		va = radv_buffer_get_va(iview->bo) + iview->image->offset + iview->image->fmask.offset;
 		cb->cb_color_fmask = va >> 8;
 		cb->cb_color_fmask |= iview->image->fmask.tile_swizzle;
@@ -3541,7 +4009,7 @@ radv_initialise_color_surface(struct radv_device *device,
 		format != V_028C70_COLOR_24_8) |
 		S_028C70_NUMBER_TYPE(ntype) |
 		S_028C70_ENDIAN(endian);
-	if ((iview->image->info.samples > 1) && iview->image->fmask.size) {
+	if (radv_image_has_fmask(iview->image)) {
 		cb->cb_color_info |= S_028C70_COMPRESSION(1);
 		if (device->physical_device->rad_info.chip_class == SI) {
 			unsigned fmask_bankh = util_logbase2(iview->image->fmask.bank_height);
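radv_init_dcc_control_reg() above reduces a set of policy decisions (memory request granularity, MSAA block sizes, texture-fetch compatibility) to small bit fields OR'd together via the generated S_028C78_* setters. A toy version of that macro style, with invented field offsets (the real CB_DCC_CONTROL layout comes from Mesa's generated register headers, not from here):

#include <stdint.h>
#include <stdio.h>

/* Invented field offsets for illustration only. */
#define DEMO_MAX_UNCOMP_SHIFT 0   /* 2-bit field */
#define DEMO_MAX_COMP_SHIFT   2   /* 2-bit field */
#define DEMO_MIN_COMP_SHIFT   4   /* 1-bit field */
#define DEMO_INDEP_64B_SHIFT  5   /* 1-bit flag  */

#define S_DEMO_MAX_UNCOMPRESSED_BLOCK_SIZE(x) ((uint32_t)(x) << DEMO_MAX_UNCOMP_SHIFT)
#define S_DEMO_MAX_COMPRESSED_BLOCK_SIZE(x)   ((uint32_t)(x) << DEMO_MAX_COMP_SHIFT)
#define S_DEMO_MIN_COMPRESSED_BLOCK_SIZE(x)   ((uint32_t)(x) << DEMO_MIN_COMP_SHIFT)
#define S_DEMO_INDEPENDENT_64B_BLOCKS(x)      ((uint32_t)(x) << DEMO_INDEP_64B_SHIFT)

int main(void)
{
	/* APU case from the comment above: DIMM request granularity is
	 * 64B, so the minimum compressed block size is bumped to 64B. */
	int has_dedicated_vram = 0;
	uint32_t min_comp = has_dedicated_vram ? 0 /* 32B */ : 1 /* 64B */;

	uint32_t reg = S_DEMO_MAX_UNCOMPRESSED_BLOCK_SIZE(3) | /* 256B */
		       S_DEMO_MAX_COMPRESSED_BLOCK_SIZE(3) |   /* match it */
		       S_DEMO_MIN_COMPRESSED_BLOCK_SIZE(min_comp) |
		       S_DEMO_INDEPENDENT_64B_BLOCKS(0);

	printf("demo dcc control: 0x%08x\n", reg);
	return 0;
}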
@@ -3549,48 +4017,17 @@ radv_initialise_color_surface(struct radv_device *device,
 		}
 	}
 
-	if (iview->image->cmask.size &&
+	if (radv_image_has_cmask(iview->image) &&
 	    !(device->instance->debug_flags & RADV_DEBUG_NO_FAST_CLEARS))
 		cb->cb_color_info |= S_028C70_FAST_CLEAR(1);
 
-	if (radv_vi_dcc_enabled(iview->image, iview->base_mip))
+	if (radv_dcc_enabled(iview->image, iview->base_mip))
 		cb->cb_color_info |= S_028C70_DCC_ENABLE(1);
 
-	if (device->physical_device->rad_info.chip_class >= VI) {
-		unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
-		unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
-		unsigned independent_64b_blocks = 0;
-		unsigned max_compressed_block_size;
-
-		/* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
-		   64 for APU because all of our APUs to date use DIMMs which have
-		   a request granularity size of 64B while all other chips have a
-		   32B request size */
-		if (!device->physical_device->rad_info.has_dedicated_vram)
-			min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
-
-		if (iview->image->info.samples > 1) {
-			if (iview->image->surface.bpe == 1)
-				max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-			else if (iview->image->surface.bpe == 2)
-				max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
-		}
-
-		if (iview->image->usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
-		                           VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
-			independent_64b_blocks = 1;
-			max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-		} else
-			max_compressed_block_size = max_uncompressed_block_size;
-
-		cb->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
-			S_028C78_MAX_COMPRESSED_BLOCK_SIZE(max_compressed_block_size) |
-			S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
-			S_028C78_INDEPENDENT_64B_BLOCKS(independent_64b_blocks);
-	}
+	cb->cb_dcc_control = radv_init_dcc_control_reg(device, iview);
 
 	/* This must be set for fast clear to work without FMASK. */
-	if (!iview->image->fmask.size &&
+	if (!radv_image_has_fmask(iview->image) &&
 	    device->physical_device->rad_info.chip_class == SI) {
 		unsigned bankh = util_logbase2(iview->image->surface.u.legacy.bankh);
 		cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
@@ -3615,7 +4052,7 @@ radv_calc_decompress_on_z_planes(struct radv_device *device,
 {
 	unsigned max_zplanes = 0;
 
-	assert(iview->image->tc_compatible_htile);
+	assert(radv_image_is_tc_compat_htile(iview->image));
 
 	if (device->physical_device->rad_info.chip_class >= GFX9) {
 		/* Default value for 32-bit depth surfaces. */
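A pattern running through these hunks is replacing raw field tests such as iview->image->fmask.size with named predicates like radv_image_has_fmask(), so the question "does this image carry FMASK/CMASK/TC-compatible HTILE metadata" is answered in exactly one place. A sketch of the shape of such accessors, over a hypothetical image struct rather than radv_image:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical image metadata; radv keeps equivalents in radv_image. */
struct demo_image {
	uint64_t fmask_size;
	uint64_t cmask_size;
	bool tc_compatible_htile;
};

/* Centralizing the test means callers no longer depend on *how*
 * "has fmask" is represented; if the representation changes (say, to a
 * flags bitfield), only these helpers need updating. */
static inline bool demo_image_has_fmask(const struct demo_image *img)
{
	return img->fmask_size != 0;
}

static inline bool demo_image_has_cmask(const struct demo_image *img)
{
	return img->cmask_size != 0;
}

static inline bool demo_image_is_tc_compat_htile(const struct demo_image *img)
{
	return img->tc_compatible_htile;
}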
@@ -3703,7 +4140,8 @@ radv_initialise_ds_surface(struct radv_device *device,
 		ds->db_z_info = S_028038_FORMAT(format) |
 			S_028038_NUM_SAMPLES(util_logbase2(iview->image->info.samples)) |
 			S_028038_SW_MODE(iview->image->surface.u.gfx9.surf.swizzle_mode) |
-			S_028038_MAXMIP(iview->image->info.levels - 1);
+			S_028038_MAXMIP(iview->image->info.levels - 1) |
+			S_028038_ZRANGE_PRECISION(1);
 		ds->db_stencil_info = S_02803C_FORMAT(stencil_format) |
 			S_02803C_SW_MODE(iview->image->surface.u.gfx9.stencil.swizzle_mode);
@@ -3717,7 +4155,7 @@ radv_initialise_ds_surface(struct radv_device *device,
 		if (radv_htile_enabled(iview->image, level)) {
 			ds->db_z_info |= S_028038_TILE_SURFACE_ENABLE(1);
 
-			if (iview->image->tc_compatible_htile) {
+			if (radv_image_is_tc_compat_htile(iview->image)) {
 				unsigned max_zplanes =
 					radv_calc_decompress_on_z_planes(device, iview);
 
@@ -3745,7 +4183,7 @@ radv_initialise_ds_surface(struct radv_device *device,
 		z_offs += iview->image->surface.u.legacy.level[level].offset;
 		s_offs += iview->image->surface.u.legacy.stencil_level[level].offset;
 
-		ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!iview->image->tc_compatible_htile);
+		ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!radv_image_is_tc_compat_htile(iview->image));
 		ds->db_z_info = S_028040_FORMAT(format) |
 			S_028040_ZRANGE_PRECISION(1);
 		ds->db_stencil_info = S_028044_FORMAT(stencil_format);
@@ -3790,7 +4228,7 @@ radv_initialise_ds_surface(struct radv_device *device,
 			ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1);
 
 			if (!iview->image->surface.has_stencil &&
-			    !iview->image->tc_compatible_htile)
+			    !radv_image_is_tc_compat_htile(iview->image))
 				/* Use all of the htile_buffer for depth if there's no
 				   stencil. */
 				ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
@@ -3799,7 +4237,7 @@ radv_initialise_ds_surface(struct radv_device *device,
 			ds->db_htile_data_base = va >> 8;
 			ds->db_htile_surface = S_028ABC_FULL_CACHE(1);
 
-			if (iview->image->tc_compatible_htile) {
+			if (radv_image_is_tc_compat_htile(iview->image)) {
 				unsigned max_zplanes =
 					radv_calc_decompress_on_z_planes(device, iview);
 
@@ -3829,7 +4267,7 @@ VkResult radv_CreateFramebuffer(
 	framebuffer = vk_alloc2(&device->alloc, pAllocator, size, 8,
 				VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (framebuffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	framebuffer->attachment_count = pCreateInfo->attachmentCount;
 	framebuffer->width = pCreateInfo->width;
@@ -4047,7 +4485,7 @@ VkResult radv_CreateSampler(
 	sampler = vk_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8,
 			    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sampler)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	radv_init_sampler(device, sampler, pCreateInfo);
 	*pSampler = radv_sampler_to_handle(sampler);
@@ -4129,7 +4567,7 @@ VkResult radv_GetMemoryFdKHR(VkDevice _device,
 	bool ret = radv_get_memory_fd(device, memory, pFD);
 	if (ret == false)
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	return VK_SUCCESS;
 }
 
@@ -4138,6 +4576,8 @@ VkResult radv_GetMemoryFdPropertiesKHR(VkDevice _device,
 				       int fd,
 				       VkMemoryFdPropertiesKHR *pMemoryFdProperties)
 {
+	RADV_FROM_HANDLE(radv_device, device, _device);
+
 	switch (handleType) {
 	case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT:
 		pMemoryFdProperties->memoryTypeBits = (1 << RADV_MEM_TYPE_COUNT) - 1;
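In the dma-buf branch above, GetMemoryFdPropertiesKHR advertises every memory type at once: with RADV_MEM_TYPE_COUNT types, (1 << count) - 1 sets exactly the low count bits, one per memory type index. A quick standalone check of that arithmetic, using an illustrative count of 4:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative count; radv's real value is RADV_MEM_TYPE_COUNT. */
	const unsigned demo_mem_type_count = 4;

	/* 1 << 4 == 0b10000; subtracting 1 gives 0b1111, i.e. one bit per
	 * memory type index 0..3: "importable into any memory type". */
	uint32_t memory_type_bits = (1u << demo_mem_type_count) - 1;

	printf("memoryTypeBits = 0x%x\n", memory_type_bits); /* prints 0xf */
	return 0;
}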
@@ -4151,7 +4591,7 @@ VkResult radv_GetMemoryFdPropertiesKHR(VkDevice _device,
 	 *
 	 * So opaque handle types fall into the default "unsupported" case.
	 */
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	}
 }
 
@@ -4162,7 +4602,7 @@ static VkResult radv_import_opaque_fd(struct radv_device *device,
 	uint32_t syncobj_handle = 0;
 	int ret = device->ws->import_syncobj(device->ws, fd, &syncobj_handle);
 	if (ret != 0)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 
 	if (*syncobj)
 		device->ws->destroy_syncobj(device->ws, *syncobj);
@@ -4183,7 +4623,7 @@ static VkResult radv_import_sync_fd(struct radv_device *device,
 	if (!syncobj_handle) {
 		int ret = device->ws->create_syncobj(device->ws, &syncobj_handle);
 		if (ret) {
-			return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+			return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 		}
 	}
 
@@ -4192,7 +4632,7 @@ static VkResult radv_import_sync_fd(struct radv_device *device,
 	} else {
 		int ret = device->ws->import_syncobj_from_sync_file(device->ws, syncobj_handle, fd);
 		if (ret != 0)
-			return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+			return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	}
 
 	*syncobj = syncobj_handle;
@@ -4259,7 +4699,7 @@ VkResult radv_GetSemaphoreFdKHR(VkDevice _device,
 	}
 
 	if (ret)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 
 	return VK_SUCCESS;
 }
@@ -4348,7 +4788,7 @@ VkResult radv_GetFenceFdKHR(VkDevice _device,
 	}
 
 	if (ret)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 
 	return VK_SUCCESS;
 }
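The two import helpers above differ in what the incoming fd denotes: an opaque fd is the syncobj itself, so importing yields a fresh handle that replaces any previous one, while a sync fd is only a fence payload poured into an existing (or newly created) syncobj. A sketch of both flavors directly against libdrm's syncobj wrappers; error handling is trimmed, drm_fd is assumed to be an open device fd, and the fd == -1 convention ("already signalled") is an assumption carried over from how such imports are typically handled:

#include <stdint.h>
#include <xf86drm.h> /* drmSyncobj* wrappers from libdrm */

/* Opaque fd: the fd refers to the syncobj object itself, so the import
 * yields a brand-new handle and any previous handle is dropped. */
static int import_opaque_fd(int drm_fd, int fd, uint32_t *syncobj)
{
	uint32_t new_handle = 0;
	int ret = drmSyncobjFDToHandle(drm_fd, fd, &new_handle);
	if (ret)
		return ret;
	if (*syncobj)
		drmSyncobjDestroy(drm_fd, *syncobj);
	*syncobj = new_handle;
	return 0;
}

/* Sync fd: the fd is a sync_file fence payload, imported *into* a
 * syncobj that is created on demand; fd == -1 is treated as "already
 * signalled" by signalling the syncobj directly. */
static int import_sync_fd(int drm_fd, int fd, uint32_t *syncobj)
{
	uint32_t handle = *syncobj;
	int ret;

	if (!handle) {
		ret = drmSyncobjCreate(drm_fd, 0, &handle);
		if (ret)
			return ret;
	}

	if (fd == -1)
		ret = drmSyncobjSignal(drm_fd, &handle, 1);
	else
		ret = drmSyncobjImportSyncFile(drm_fd, handle, fd);
	if (ret)
		return ret;

	*syncobj = handle;
	return 0;
}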