X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_device.c;h=9bc44b84a2ab82f88d8602271da98df2717f9a0f;hb=06ffd299252311f57feac4474551bd5b44d3d4d4;hp=04c0bdc19414638a8ab8bd0dd15b0d0faa9b72d7;hpb=d94383970f2cad9f474760b18fd277efeda4c612;p=mesa.git diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 04c0bdc1941..9bc44b84a2a 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -32,6 +32,7 @@ #include #include #include "radv_private.h" +#include "radv_cs.h" #include "util/strtod.h" #include @@ -61,9 +62,10 @@ radv_get_function_timestamp(void *ptr, uint32_t* timestamp) } static int -radv_device_get_cache_uuid(void *uuid) +radv_device_get_cache_uuid(enum radeon_family family, void *uuid) { uint32_t mesa_timestamp, llvm_timestamp; + uint16_t f = family; memset(uuid, 0, VK_UUID_SIZE); if (radv_get_function_timestamp(radv_device_get_cache_uuid, &mesa_timestamp) || radv_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, &llvm_timestamp)) @@ -71,10 +73,121 @@ radv_device_get_cache_uuid(void *uuid) memcpy(uuid, &mesa_timestamp, 4); memcpy((char*)uuid + 4, &llvm_timestamp, 4); - snprintf((char*)uuid + 8, VK_UUID_SIZE - 8, "radv"); + memcpy((char*)uuid + 8, &f, 2); + snprintf((char*)uuid + 10, VK_UUID_SIZE - 10, "radv"); return 0; } +static const VkExtensionProperties instance_extensions[] = { + { + .extensionName = VK_KHR_SURFACE_EXTENSION_NAME, + .specVersion = 25, + }, +#ifdef VK_USE_PLATFORM_XCB_KHR + { + .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME, + .specVersion = 6, + }, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + { + .extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME, + .specVersion = 6, + }, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + { + .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, + .specVersion = 5, + }, +#endif +}; + +static const VkExtensionProperties common_device_extensions[] = { + { + .extensionName = VK_KHR_MAINTENANCE1_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME, + .specVersion = 68, + }, + { + .extensionName = VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, + .specVersion = 1, + }, +}; + +static VkResult +radv_extensions_register(struct radv_instance *instance, + struct radv_extensions *extensions, + const VkExtensionProperties *new_ext, + uint32_t num_ext) +{ + size_t new_size; + VkExtensionProperties *new_ptr; + + assert(new_ext && num_ext > 0); + + if (!new_ext) + return VK_ERROR_INITIALIZATION_FAILED; + + new_size = (extensions->num_ext + num_ext) * sizeof(VkExtensionProperties); + new_ptr = vk_realloc(&instance->alloc, extensions->ext_array, + new_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + /* Old array continues to be valid, update nothing */ + if (!new_ptr) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + memcpy(&new_ptr[extensions->num_ext], new_ext, + num_ext * sizeof(VkExtensionProperties)); + extensions->ext_array = new_ptr; + extensions->num_ext += num_ext; + + return VK_SUCCESS; +} + +static void +radv_extensions_finish(struct radv_instance *instance, + struct radv_extensions *extensions) +{ + assert(extensions); + + if (!extensions) + radv_loge("Attemted to free invalid extension struct\n"); + + if (extensions->ext_array) + 
vk_free(&instance->alloc, extensions->ext_array); +} + +static bool +is_extension_enabled(const VkExtensionProperties *extensions, + size_t num_ext, + const char *name) +{ + assert(extensions && name); + + for (uint32_t i = 0; i < num_ext; i++) { + if (strcmp(name, extensions[i].extensionName) == 0) + return true; + } + + return false; +} + static VkResult radv_physical_device_init(struct radv_physical_device *device, struct radv_instance *instance, @@ -86,8 +199,7 @@ radv_physical_device_init(struct radv_physical_device *device, fd = open(path, O_RDWR | O_CLOEXEC); if (fd < 0) - return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER, - "failed to open %s: %m", path); + return VK_ERROR_INCOMPATIBLE_DRIVER; version = drmGetVersion(fd); if (!version) { @@ -120,14 +232,24 @@ radv_physical_device_init(struct radv_physical_device *device, goto fail; } - if (radv_device_get_cache_uuid(device->uuid)) { + if (radv_device_get_cache_uuid(device->rad_info.family, device->uuid)) { radv_finish_wsi(device); device->ws->destroy(device->ws); + result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED, + "cannot generate UUID"); goto fail; } + result = radv_extensions_register(instance, + &device->extensions, + common_device_extensions, + ARRAY_SIZE(common_device_extensions)); + if (result != VK_SUCCESS) + goto fail; + fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); device->name = device->rad_info.name; + close(fd); return VK_SUCCESS; fail: @@ -138,41 +260,11 @@ fail: static void radv_physical_device_finish(struct radv_physical_device *device) { + radv_extensions_finish(device->instance, &device->extensions); radv_finish_wsi(device); device->ws->destroy(device->ws); } -static const VkExtensionProperties global_extensions[] = { - { - .extensionName = VK_KHR_SURFACE_EXTENSION_NAME, - .specVersion = 25, - }, -#ifdef VK_USE_PLATFORM_XCB_KHR - { - .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME, - .specVersion = 6, - }, -#endif -#ifdef VK_USE_PLATFORM_XLIB_KHR - { - .extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME, - .specVersion = 6, - }, -#endif -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - { - .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, - .specVersion = 5, - }, -#endif -}; - -static const VkExtensionProperties device_extensions[] = { - { - .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME, - .specVersion = 68, - }, -}; static void * default_alloc_func(void *pUserData, size_t size, size_t align, @@ -201,6 +293,18 @@ static const VkAllocationCallbacks default_alloc = { .pfnFree = default_free_func, }; +static const struct debug_control radv_debug_options[] = { + {"nofastclears", RADV_DEBUG_NO_FAST_CLEARS}, + {"nodcc", RADV_DEBUG_NO_DCC}, + {"shaders", RADV_DEBUG_DUMP_SHADERS}, + {"nocache", RADV_DEBUG_NO_CACHE}, + {"shaderstats", RADV_DEBUG_DUMP_SHADER_STATS}, + {"nohiz", RADV_DEBUG_NO_HIZ}, + {"nocompute", RADV_DEBUG_NO_COMPUTE_QUEUE}, + {"unsafemath", RADV_DEBUG_UNSAFE_MATH}, + {NULL, 0} +}; + VkResult radv_CreateInstance( const VkInstanceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, @@ -228,15 +332,9 @@ VkResult radv_CreateInstance( } for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { - bool found = false; - for (uint32_t j = 0; j < ARRAY_SIZE(global_extensions); j++) { - if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], - global_extensions[j].extensionName) == 0) { - found = true; - break; - } - } - if (!found) + if (!is_extension_enabled(instance_extensions, + ARRAY_SIZE(instance_extensions), + 
pCreateInfo->ppEnabledExtensionNames[i])) return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT); } @@ -245,6 +343,8 @@ VkResult radv_CreateInstance( if (!instance) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + memset(instance, 0, sizeof(*instance)); + instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC; if (pAllocator) @@ -259,6 +359,9 @@ VkResult radv_CreateInstance( VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"), + radv_debug_options); + *pInstance = radv_instance_to_handle(instance); return VK_SUCCESS; @@ -270,10 +373,8 @@ void radv_DestroyInstance( { RADV_FROM_HANDLE(radv_instance, instance, _instance); - if (instance->physicalDeviceCount > 0) { - /* We support at most one physical device. */ - assert(instance->physicalDeviceCount == 1); - radv_physical_device_finish(&instance->physicalDevice); + for (int i = 0; i < instance->physicalDeviceCount; ++i) { + radv_physical_device_finish(instance->physicalDevices + i); } VG(VALGRIND_DESTROY_MEMPOOL(instance)); @@ -293,52 +394,29 @@ VkResult radv_EnumeratePhysicalDevices( if (instance->physicalDeviceCount < 0) { char path[20]; - for (unsigned i = 0; i < 8; i++) { + instance->physicalDeviceCount = 0; + for (unsigned i = 0; i < RADV_MAX_DRM_DEVICES; i++) { snprintf(path, sizeof(path), "/dev/dri/renderD%d", 128 + i); - result = radv_physical_device_init(&instance->physicalDevice, - instance, path); - if (result != VK_ERROR_INCOMPATIBLE_DRIVER) - break; - } - - if (result == VK_ERROR_INCOMPATIBLE_DRIVER) { - instance->physicalDeviceCount = 0; - } else if (result == VK_SUCCESS) { - instance->physicalDeviceCount = 1; - } else { - return result; + result = radv_physical_device_init(instance->physicalDevices + + instance->physicalDeviceCount, + instance, path); + if (result == VK_SUCCESS) + ++instance->physicalDeviceCount; + else if (result != VK_ERROR_INCOMPATIBLE_DRIVER) + return result; } } - /* pPhysicalDeviceCount is an out parameter if pPhysicalDevices is NULL; - * otherwise it's an inout parameter. - * - * The Vulkan spec (git aaed022) says: - * - * pPhysicalDeviceCount is a pointer to an unsigned integer variable - * that is initialized with the number of devices the application is - * prepared to receive handles to. pname:pPhysicalDevices is pointer to - * an array of at least this many VkPhysicalDevice handles [...]. - * - * Upon success, if pPhysicalDevices is NULL, vkEnumeratePhysicalDevices - * overwrites the contents of the variable pointed to by - * pPhysicalDeviceCount with the number of physical devices in in the - * instance; otherwise, vkEnumeratePhysicalDevices overwrites - * pPhysicalDeviceCount with the number of physical handles written to - * pPhysicalDevices. - */ if (!pPhysicalDevices) { *pPhysicalDeviceCount = instance->physicalDeviceCount; - } else if (*pPhysicalDeviceCount >= 1) { - pPhysicalDevices[0] = radv_physical_device_to_handle(&instance->physicalDevice); - *pPhysicalDeviceCount = 1; - } else if (*pPhysicalDeviceCount < instance->physicalDeviceCount) { - return VK_INCOMPLETE; } else { - *pPhysicalDeviceCount = 0; + *pPhysicalDeviceCount = MIN2(*pPhysicalDeviceCount, instance->physicalDeviceCount); + for (unsigned i = 0; i < *pPhysicalDeviceCount; ++i) + pPhysicalDevices[i] = radv_physical_device_to_handle(instance->physicalDevices + i); } - return VK_SUCCESS; + return *pPhysicalDeviceCount < instance->physicalDeviceCount ? 
VK_INCOMPLETE + : VK_SUCCESS; } void radv_GetPhysicalDeviceFeatures( @@ -354,7 +432,7 @@ void radv_GetPhysicalDeviceFeatures( .fullDrawIndexUint32 = true, .imageCubeArray = true, .independentBlend = true, - .geometryShader = false, + .geometryShader = true, .tessellationShader = false, .sampleRateShading = false, .dualSrcBlend = true, @@ -368,8 +446,8 @@ void radv_GetPhysicalDeviceFeatures( .wideLines = true, .largePoints = true, .alphaToOne = true, - .multiViewport = false, - .samplerAnisotropy = false, /* FINISHME */ + .multiViewport = true, + .samplerAnisotropy = true, .textureCompressionETC2 = false, .textureCompressionASTC_LDR = false, .textureCompressionBC = true, @@ -378,18 +456,18 @@ void radv_GetPhysicalDeviceFeatures( .vertexPipelineStoresAndAtomics = true, .fragmentStoresAndAtomics = true, .shaderTessellationAndGeometryPointSize = true, - .shaderImageGatherExtended = false, - .shaderStorageImageExtendedFormats = false, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = true, .shaderStorageImageMultisample = false, .shaderUniformBufferArrayDynamicIndexing = true, .shaderSampledImageArrayDynamicIndexing = true, .shaderStorageBufferArrayDynamicIndexing = true, .shaderStorageImageArrayDynamicIndexing = true, .shaderStorageImageReadWithoutFormat = false, - .shaderStorageImageWriteWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = false, .shaderClipDistance = true, .shaderCullDistance = true, - .shaderFloat64 = false, + .shaderFloat64 = true, .shaderInt64 = false, .shaderInt16 = false, .alphaToOne = true, @@ -398,6 +476,13 @@ void radv_GetPhysicalDeviceFeatures( }; } +void radv_GetPhysicalDeviceFeatures2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2KHR *pFeatures) +{ + return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); +} + void radv_GetPhysicalDeviceProperties( VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties* pProperties) @@ -454,15 +539,15 @@ void radv_GetPhysicalDeviceProperties( .maxGeometryTotalOutputComponents = 1024, .maxFragmentInputComponents = 128, .maxFragmentOutputAttachments = 8, - .maxFragmentDualSrcAttachments = 2, + .maxFragmentDualSrcAttachments = 1, .maxFragmentCombinedOutputResources = 8, .maxComputeSharedMemorySize = 32768, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, - .maxComputeWorkGroupInvocations = 16 * 1024, + .maxComputeWorkGroupInvocations = 2048, .maxComputeWorkGroupSize = { - 16 * 1024/*devinfo->max_cs_threads*/, - 16 * 1024, - 16 * 1024 + 2048, + 2048, + 2048 }, .subPixelPrecisionBits = 4 /* FIXME */, .subTexelPrecisionBits = 4 /* FIXME */, @@ -479,13 +564,13 @@ void radv_GetPhysicalDeviceProperties( .minTexelBufferOffsetAlignment = 1, .minUniformBufferOffsetAlignment = 4, .minStorageBufferOffsetAlignment = 4, - .minTexelOffset = -8, - .maxTexelOffset = 7, - .minTexelGatherOffset = -8, - .maxTexelGatherOffset = 7, - .minInterpolationOffset = 0, /* FIXME */ - .maxInterpolationOffset = 0, /* FIXME */ - .subPixelInterpolationOffsetBits = 0, /* FIXME */ + .minTexelOffset = -32, + .maxTexelOffset = 31, + .minTexelGatherOffset = -32, + .maxTexelGatherOffset = 31, + .minInterpolationOffset = -2, + .maxInterpolationOffset = 2, + .subPixelInterpolationOffsetBits = 8, .maxFramebufferWidth = (1 << 14), .maxFramebufferHeight = (1 << 14), .maxFramebufferLayers = (1 << 10), @@ -531,77 +616,169 @@ void radv_GetPhysicalDeviceProperties( memcpy(pProperties->pipelineCacheUUID, pdevice->uuid, VK_UUID_SIZE); } +void radv_GetPhysicalDeviceProperties2KHR( + 
VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties2KHR *pProperties) +{ + return radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); +} + +static void radv_get_physical_device_queue_family_properties( + struct radv_physical_device* pdevice, + uint32_t* pCount, + VkQueueFamilyProperties** pQueueFamilyProperties) +{ + int num_queue_families = 1; + int idx; + if (pdevice->rad_info.compute_rings > 0 && + pdevice->rad_info.chip_class >= CIK && + !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) + num_queue_families++; + + if (pQueueFamilyProperties == NULL) { + *pCount = num_queue_families; + return; + } + + if (!*pCount) + return; + + idx = 0; + if (*pCount >= 1) { + *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = 1, + .timestampValidBits = 64, + .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, + }; + idx++; + } + + if (pdevice->rad_info.compute_rings > 0 && + pdevice->rad_info.chip_class >= CIK && + !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) { + if (*pCount > idx) { + *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { + .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, + .queueCount = pdevice->rad_info.compute_rings, + .timestampValidBits = 64, + .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, + }; + idx++; + } + } + *pCount = idx; +} + void radv_GetPhysicalDeviceQueueFamilyProperties( VkPhysicalDevice physicalDevice, uint32_t* pCount, VkQueueFamilyProperties* pQueueFamilyProperties) { - if (pQueueFamilyProperties == NULL) { - *pCount = 1; + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + if (!pQueueFamilyProperties) { + return radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL); return; } - assert(*pCount >= 1); + VkQueueFamilyProperties *properties[] = { + pQueueFamilyProperties + 0, + pQueueFamilyProperties + 1, + pQueueFamilyProperties + 2, + }; + radv_get_physical_device_queue_family_properties(pdevice, pCount, properties); + assert(*pCount <= 3); +} - *pQueueFamilyProperties = (VkQueueFamilyProperties) { - .queueFlags = VK_QUEUE_GRAPHICS_BIT | - VK_QUEUE_COMPUTE_BIT | - VK_QUEUE_TRANSFER_BIT, - .queueCount = 1, - .timestampValidBits = 64, - .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, +void radv_GetPhysicalDeviceQueueFamilyProperties2KHR( + VkPhysicalDevice physicalDevice, + uint32_t* pCount, + VkQueueFamilyProperties2KHR *pQueueFamilyProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + if (!pQueueFamilyProperties) { + return radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL); + return; + } + VkQueueFamilyProperties *properties[] = { + &pQueueFamilyProperties[0].queueFamilyProperties, + &pQueueFamilyProperties[1].queueFamilyProperties, + &pQueueFamilyProperties[2].queueFamilyProperties, }; + radv_get_physical_device_queue_family_properties(pdevice, pCount, properties); + assert(*pCount <= 3); } void radv_GetPhysicalDeviceMemoryProperties( VkPhysicalDevice physicalDevice, - VkPhysicalDeviceMemoryProperties* pMemoryProperties) + VkPhysicalDeviceMemoryProperties *pMemoryProperties) { RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); - pMemoryProperties->memoryTypeCount = 4; - pMemoryProperties->memoryTypes[0] = (VkMemoryType) { + STATIC_ASSERT(RADV_MEM_TYPE_COUNT <= VK_MAX_MEMORY_TYPES); + + pMemoryProperties->memoryTypeCount = 
RADV_MEM_TYPE_COUNT; + pMemoryProperties->memoryTypes[RADV_MEM_TYPE_VRAM] = (VkMemoryType) { .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - .heapIndex = 0, + .heapIndex = RADV_MEM_HEAP_VRAM, }; - pMemoryProperties->memoryTypes[1] = (VkMemoryType) { + pMemoryProperties->memoryTypes[RADV_MEM_TYPE_GTT_WRITE_COMBINE] = (VkMemoryType) { .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - .heapIndex = 2, + .heapIndex = RADV_MEM_HEAP_GTT, }; - pMemoryProperties->memoryTypes[2] = (VkMemoryType) { + pMemoryProperties->memoryTypes[RADV_MEM_TYPE_VRAM_CPU_ACCESS] = (VkMemoryType) { .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - .heapIndex = 1, + .heapIndex = RADV_MEM_HEAP_VRAM_CPU_ACCESS, }; - pMemoryProperties->memoryTypes[3] = (VkMemoryType) { + pMemoryProperties->memoryTypes[RADV_MEM_TYPE_GTT_CACHED] = (VkMemoryType) { .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, - .heapIndex = 2, + .heapIndex = RADV_MEM_HEAP_GTT, }; - pMemoryProperties->memoryHeapCount = 3; - pMemoryProperties->memoryHeaps[0] = (VkMemoryHeap) { + STATIC_ASSERT(RADV_MEM_HEAP_COUNT <= VK_MAX_MEMORY_HEAPS); + + pMemoryProperties->memoryHeapCount = RADV_MEM_HEAP_COUNT; + pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM] = (VkMemoryHeap) { .size = physical_device->rad_info.vram_size - physical_device->rad_info.visible_vram_size, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, }; - pMemoryProperties->memoryHeaps[1] = (VkMemoryHeap) { + pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM_CPU_ACCESS] = (VkMemoryHeap) { .size = physical_device->rad_info.visible_vram_size, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, }; - pMemoryProperties->memoryHeaps[2] = (VkMemoryHeap) { + pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_GTT] = (VkMemoryHeap) { .size = physical_device->rad_info.gart_size, .flags = 0, }; } -static VkResult -radv_queue_init(struct radv_device *device, struct radv_queue *queue) +void radv_GetPhysicalDeviceMemoryProperties2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2KHR *pMemoryProperties) +{ + return radv_GetPhysicalDeviceMemoryProperties(physicalDevice, + &pMemoryProperties->memoryProperties); +} + +static int +radv_queue_init(struct radv_device *device, struct radv_queue *queue, + int queue_family_index, int idx) { queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC; queue->device = device; + queue->queue_family_index = queue_family_index; + queue->queue_idx = idx; + + queue->hw_ctx = device->ws->ctx_create(device->ws); + if (!queue->hw_ctx) + return VK_ERROR_OUT_OF_HOST_MEMORY; return VK_SUCCESS; } @@ -609,6 +786,51 @@ radv_queue_init(struct radv_device *device, struct radv_queue *queue) static void radv_queue_finish(struct radv_queue *queue) { + if (queue->hw_ctx) + queue->device->ws->ctx_destroy(queue->hw_ctx); + + if (queue->preamble_cs) + queue->device->ws->cs_destroy(queue->preamble_cs); + if (queue->descriptor_bo) + queue->device->ws->buffer_destroy(queue->descriptor_bo); + if (queue->scratch_bo) + queue->device->ws->buffer_destroy(queue->scratch_bo); + if (queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); + if (queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); + if (queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(queue->compute_scratch_bo); +} + +static void +radv_device_init_gs_info(struct radv_device 
*device) +{ + switch (device->physical_device->rad_info.family) { + case CHIP_OLAND: + case CHIP_HAINAN: + case CHIP_KAVERI: + case CHIP_KABINI: + case CHIP_MULLINS: + case CHIP_ICELAND: + case CHIP_CARRIZO: + case CHIP_STONEY: + device->gs_table_depth = 16; + return; + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_VERDE: + case CHIP_BONAIRE: + case CHIP_HAWAII: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + device->gs_table_depth = 32; + return; + default: + unreachable("unknown GPU"); + } } VkResult radv_CreateDevice( @@ -622,15 +844,9 @@ VkResult radv_CreateDevice( struct radv_device *device; for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { - bool found = false; - for (uint32_t j = 0; j < ARRAY_SIZE(device_extensions); j++) { - if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], - device_extensions[j].extensionName) == 0) { - found = true; - break; - } - } - if (!found) + if (!is_extension_enabled(physical_device->extensions.ext_array, + physical_device->extensions.num_ext, + pCreateInfo->ppEnabledExtensionNames[i])) return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT); } @@ -640,8 +856,13 @@ VkResult radv_CreateDevice( if (!device) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + memset(device, 0, sizeof(*device)); + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; device->instance = physical_device->instance; + device->physical_device = physical_device; + + device->debug_flags = device->instance->debug_flags; device->ws = physical_device->ws; if (pAllocator) @@ -649,34 +870,106 @@ VkResult radv_CreateDevice( else device->alloc = physical_device->instance->alloc; - device->hw_ctx = device->ws->ctx_create(device->ws); - if (!device->hw_ctx) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail_free; + for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; + uint32_t qfi = queue_create->queueFamilyIndex; + + device->queues[qfi] = vk_alloc(&device->alloc, + queue_create->queueCount * sizeof(struct radv_queue), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device->queues[qfi]) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + memset(device->queues[qfi], 0, queue_create->queueCount * sizeof(struct radv_queue)); + + device->queue_count[qfi] = queue_create->queueCount; + + for (unsigned q = 0; q < queue_create->queueCount; q++) { + result = radv_queue_init(device, &device->queues[qfi][q], qfi, q); + if (result != VK_SUCCESS) + goto fail; + } } - radv_queue_init(device, &device->queue); +#if HAVE_LLVM < 0x0400 + device->llvm_supports_spill = false; +#else + device->llvm_supports_spill = true; +#endif + + /* The maximum number of scratch waves. Scratch space isn't divided + * evenly between CUs. The number is only a function of the number of CUs. + * We can decrease the constant to decrease the scratch buffer size. + * + * sctx->scratch_waves must be >= the maximum posible size of + * 1 threadgroup, so that the hw doesn't hang from being unable + * to start any. + * + * The recommended value is 4 per CU at most. Higher numbers don't + * bring much benefit, but they still occupy chip resources (think + * async compute). I've seen ~2% performance difference between 4 and 32. 
+ */ + uint32_t max_threads_per_block = 2048; + device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, + max_threads_per_block / 64); + + radv_device_init_gs_info(device); result = radv_device_init_meta(device); - if (result != VK_SUCCESS) { - device->ws->ctx_destroy(device->hw_ctx); - goto fail_free; + if (result != VK_SUCCESS) + goto fail; + + radv_device_init_msaa(device); + + for (int family = 0; family < RADV_MAX_QUEUE_FAMILIES; ++family) { + device->empty_cs[family] = device->ws->cs_create(device->ws, family); + switch (family) { + case RADV_QUEUE_GENERAL: + radeon_emit(device->empty_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(device->empty_cs[family], CONTEXT_CONTROL_LOAD_ENABLE(1)); + radeon_emit(device->empty_cs[family], CONTEXT_CONTROL_SHADOW_ENABLE(1)); + break; + case RADV_QUEUE_COMPUTE: + radeon_emit(device->empty_cs[family], PKT3(PKT3_NOP, 0, 0)); + radeon_emit(device->empty_cs[family], 0); + break; + } + device->ws->cs_finalize(device->empty_cs[family]); } - device->allow_fast_clears = env_var_as_boolean("RADV_FAST_CLEARS", false); - device->allow_dcc = !env_var_as_boolean("RADV_DCC_DISABLE", false); - if (device->allow_fast_clears && device->allow_dcc) - radv_finishme("DCC fast clears have not been tested\n"); + if (getenv("RADV_TRACE_FILE")) { + device->trace_bo = device->ws->buffer_create(device->ws, 4096, 8, + RADEON_DOMAIN_VRAM, RADEON_FLAG_CPU_ACCESS); + if (!device->trace_bo) + goto fail; + + device->trace_id_ptr = device->ws->buffer_map(device->trace_bo); + if (!device->trace_id_ptr) + goto fail; + } + + /* temporarily disabled on CIK */ + if (device->physical_device->rad_info.chip_class > CIK) + cik_create_gfx_config(device); - radv_device_init_msaa(device); - device->empty_cs = device->ws->cs_create(device->ws, RING_GFX); - radeon_emit(device->empty_cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(device->empty_cs, CONTEXT_CONTROL_LOAD_ENABLE(1)); - radeon_emit(device->empty_cs, CONTEXT_CONTROL_SHADOW_ENABLE(1)); - device->ws->cs_finalize(device->empty_cs); *pDevice = radv_device_to_handle(device); return VK_SUCCESS; -fail_free: + +fail: + if (device->trace_bo) + device->ws->buffer_destroy(device->trace_bo); + + if (device->gfx_init) + device->ws->buffer_destroy(device->gfx_init); + + for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { + for (unsigned q = 0; q < device->queue_count[i]; q++) + radv_queue_finish(&device->queues[i][q]); + if (device->queue_count[i]) + vk_free(&device->alloc, device->queues[i]); + } + vk_free(&device->alloc, device); return result; } @@ -687,8 +980,18 @@ void radv_DestroyDevice( { RADV_FROM_HANDLE(radv_device, device, _device); - device->ws->ctx_destroy(device->hw_ctx); - radv_queue_finish(&device->queue); + if (device->trace_bo) + device->ws->buffer_destroy(device->trace_bo); + + if (device->gfx_init) + device->ws->buffer_destroy(device->gfx_init); + + for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { + for (unsigned q = 0; q < device->queue_count[i]; q++) + radv_queue_finish(&device->queues[i][q]); + if (device->queue_count[i]) + vk_free(&device->alloc, device->queues[i]); + } radv_device_finish_meta(device); vk_free(&device->alloc, device); @@ -699,17 +1002,15 @@ VkResult radv_EnumerateInstanceExtensionProperties( uint32_t* pPropertyCount, VkExtensionProperties* pProperties) { - unsigned i; if (pProperties == NULL) { - *pPropertyCount = ARRAY_SIZE(global_extensions); + *pPropertyCount = ARRAY_SIZE(instance_extensions); return VK_SUCCESS; } - for (i = 0; i < 
*pPropertyCount; i++) - memcpy(&pProperties[i], &global_extensions[i], sizeof(VkExtensionProperties)); + *pPropertyCount = MIN2(*pPropertyCount, ARRAY_SIZE(instance_extensions)); + typed_memcpy(pProperties, instance_extensions, *pPropertyCount); - *pPropertyCount = i; - if (i < ARRAY_SIZE(global_extensions)) + if (*pPropertyCount < ARRAY_SIZE(instance_extensions)) return VK_INCOMPLETE; return VK_SUCCESS; @@ -721,19 +1022,19 @@ VkResult radv_EnumerateDeviceExtensionProperties( uint32_t* pPropertyCount, VkExtensionProperties* pProperties) { - unsigned i; + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); if (pProperties == NULL) { - *pPropertyCount = ARRAY_SIZE(device_extensions); + *pPropertyCount = pdevice->extensions.num_ext; return VK_SUCCESS; } - for (i = 0; i < *pPropertyCount; i++) - memcpy(&pProperties[i], &device_extensions[i], sizeof(VkExtensionProperties)); + *pPropertyCount = MIN2(*pPropertyCount, pdevice->extensions.num_ext); + typed_memcpy(pProperties, pdevice->extensions.ext_array, *pPropertyCount); - *pPropertyCount = i; - if (i < ARRAY_SIZE(device_extensions)) + if (*pPropertyCount < pdevice->extensions.num_ext) return VK_INCOMPLETE; + return VK_SUCCESS; } @@ -766,15 +1067,357 @@ VkResult radv_EnumerateDeviceLayerProperties( void radv_GetDeviceQueue( VkDevice _device, - uint32_t queueNodeIndex, + uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue* pQueue) { RADV_FROM_HANDLE(radv_device, device, _device); - assert(queueIndex == 0); + *pQueue = radv_queue_to_handle(&device->queues[queueFamilyIndex][queueIndex]); +} + +static void radv_dump_trace(struct radv_device *device, + struct radeon_winsys_cs *cs) +{ + const char *filename = getenv("RADV_TRACE_FILE"); + FILE *f = fopen(filename, "w"); + if (!f) { + fprintf(stderr, "Failed to write trace dump to %s\n", filename); + return; + } + + fprintf(f, "Trace ID: %x\n", *device->trace_id_ptr); + device->ws->cs_dump(cs, f, *device->trace_id_ptr); + fclose(f); +} - *pQueue = radv_queue_to_handle(&device->queue); +static void +fill_geom_rings(struct radv_queue *queue, + uint32_t *map, + uint32_t esgs_ring_size, + struct radeon_winsys_bo *esgs_ring_bo, + uint32_t gsvs_ring_size, + struct radeon_winsys_bo *gsvs_ring_bo) +{ + uint64_t esgs_va = 0, gsvs_va = 0; + uint32_t *desc = &map[4]; + + if (esgs_ring_bo) + esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo); + if (gsvs_ring_bo) + gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo); + + /* stride 0, num records - size, add tid, swizzle, elsize4, + index stride 64 */ + desc[0] = esgs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(true); + desc[2] = esgs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | + S_008F0C_INDEX_STRIDE(3) | + S_008F0C_ADD_TID_ENABLE(true); + + desc += 4; + /* GS entry for ES->GS ring */ + /* stride 0, num records - size, elsize0, + index stride 0 */ + desc[0] = esgs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = esgs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + 
S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + + desc += 4; + /* VS entry for GS->VS ring */ + /* stride 0, num records - size, elsize0, + index stride 0 */ + desc[0] = gsvs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = gsvs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + /* stride gsvs_itemsize, num records 64 + elsize 4, index stride 16 */ + /* shader will patch stride and desc[2] */ + desc[0] = gsvs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(true); + desc[2] = 0; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | + S_008F0C_INDEX_STRIDE(1) | + S_008F0C_ADD_TID_ENABLE(true); +} + +static VkResult +radv_get_preamble_cs(struct radv_queue *queue, + uint32_t scratch_size, + uint32_t compute_scratch_size, + uint32_t esgs_ring_size, + uint32_t gsvs_ring_size, + struct radeon_winsys_cs **preamble_cs) +{ + struct radeon_winsys_bo *scratch_bo = NULL; + struct radeon_winsys_bo *descriptor_bo = NULL; + struct radeon_winsys_bo *compute_scratch_bo = NULL; + struct radeon_winsys_bo *esgs_ring_bo = NULL; + struct radeon_winsys_bo *gsvs_ring_bo = NULL; + struct radeon_winsys_cs *cs = NULL; + + if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) { + *preamble_cs = NULL; + return VK_SUCCESS; + } + + if (scratch_size <= queue->scratch_size && + compute_scratch_size <= queue->compute_scratch_size && + esgs_ring_size <= queue->esgs_ring_size && + gsvs_ring_size <= queue->gsvs_ring_size) { + *preamble_cs = queue->preamble_cs; + return VK_SUCCESS; + } + + if (scratch_size > queue->scratch_size) { + scratch_bo = queue->device->ws->buffer_create(queue->device->ws, + scratch_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!scratch_bo) + goto fail; + } else + scratch_bo = queue->scratch_bo; + + if (compute_scratch_size > queue->compute_scratch_size) { + compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws, + compute_scratch_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!compute_scratch_bo) + goto fail; + + } else + compute_scratch_bo = queue->compute_scratch_bo; + + if (esgs_ring_size > queue->esgs_ring_size) { + esgs_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + esgs_ring_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!esgs_ring_bo) + goto fail; + } else { + esgs_ring_bo = queue->esgs_ring_bo; + esgs_ring_size = queue->esgs_ring_size; + } + + if (gsvs_ring_size > queue->gsvs_ring_size) { + gsvs_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + gsvs_ring_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!gsvs_ring_bo) + goto fail; + } 
else { + gsvs_ring_bo = queue->gsvs_ring_bo; + gsvs_ring_size = queue->gsvs_ring_size; + } + + if (scratch_bo != queue->scratch_bo || + esgs_ring_bo != queue->esgs_ring_bo || + gsvs_ring_bo != queue->gsvs_ring_bo) { + uint32_t size = 0; + if (gsvs_ring_bo || esgs_ring_bo) + size = 80; /* 2 dword + 2 padding + 4 dword * 4 */ + else if (scratch_bo) + size = 8; /* 2 dword */ + + descriptor_bo = queue->device->ws->buffer_create(queue->device->ws, + size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_CPU_ACCESS); + if (!descriptor_bo) + goto fail; + } else + descriptor_bo = queue->descriptor_bo; + + cs = queue->device->ws->cs_create(queue->device->ws, + queue->queue_family_index ? RING_COMPUTE : RING_GFX); + if (!cs) + goto fail; + + + if (scratch_bo) + queue->device->ws->cs_add_buffer(cs, scratch_bo, 8); + + if (esgs_ring_bo) + queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8); + + if (gsvs_ring_bo) + queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8); + + if (descriptor_bo) + queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8); + + if (descriptor_bo != queue->descriptor_bo) { + uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo); + + if (scratch_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + map[0] = scratch_va; + map[1] = rsrc1; + } + + if (esgs_ring_bo || gsvs_ring_bo) + fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo); + + queue->device->ws->buffer_unmap(descriptor_bo); + } + + if (esgs_ring_bo || gsvs_ring_bo) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + + if (queue->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2); + radeon_emit(cs, esgs_ring_size >> 8); + radeon_emit(cs, gsvs_ring_size >> 8); + } else { + radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2); + radeon_emit(cs, esgs_ring_size >> 8); + radeon_emit(cs, gsvs_ring_size >> 8); + } + } + + if (descriptor_bo) { + uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0, + R_00B130_SPI_SHADER_USER_DATA_VS_0, + R_00B230_SPI_SHADER_USER_DATA_GS_0, + R_00B330_SPI_SHADER_USER_DATA_ES_0, + R_00B430_SPI_SHADER_USER_DATA_HS_0, + R_00B530_SPI_SHADER_USER_DATA_LS_0}; + + uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo); + + for (int i = 0; i < ARRAY_SIZE(regs); ++i) { + radeon_set_sh_reg_seq(cs, regs[i], 2); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } + } + + if (compute_scratch_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + + queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8); + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); + radeon_emit(cs, scratch_va); + radeon_emit(cs, rsrc1); + } + + if (!queue->device->ws->cs_finalize(cs)) + goto fail; + + if (queue->preamble_cs) + queue->device->ws->cs_destroy(queue->preamble_cs); + + queue->preamble_cs = cs; + + if (scratch_bo != queue->scratch_bo) { + if (queue->scratch_bo) + queue->device->ws->buffer_destroy(queue->scratch_bo); + queue->scratch_bo = scratch_bo; + queue->scratch_size = scratch_size; + } + + if (compute_scratch_bo != 
queue->compute_scratch_bo) { + if (queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(queue->compute_scratch_bo); + queue->compute_scratch_bo = compute_scratch_bo; + queue->compute_scratch_size = compute_scratch_size; + } + + if (esgs_ring_bo != queue->esgs_ring_bo) { + if (queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); + queue->esgs_ring_bo = esgs_ring_bo; + queue->esgs_ring_size = esgs_ring_size; + } + + if (gsvs_ring_bo != queue->gsvs_ring_bo) { + if (queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); + queue->gsvs_ring_bo = gsvs_ring_bo; + queue->gsvs_ring_size = gsvs_ring_size; + } + + if (descriptor_bo != queue->descriptor_bo) { + if (queue->descriptor_bo) + queue->device->ws->buffer_destroy(queue->descriptor_bo); + + queue->descriptor_bo = descriptor_bo; + } + + *preamble_cs = cs; + return VK_SUCCESS; +fail: + if (cs) + queue->device->ws->cs_destroy(cs); + if (descriptor_bo && descriptor_bo != queue->descriptor_bo) + queue->device->ws->buffer_destroy(descriptor_bo); + if (scratch_bo && scratch_bo != queue->scratch_bo) + queue->device->ws->buffer_destroy(scratch_bo); + if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(compute_scratch_bo); + if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(esgs_ring_bo); + if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(gsvs_ring_bo); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; } VkResult radv_QueueSubmit( @@ -786,40 +1429,129 @@ VkResult radv_QueueSubmit( RADV_FROM_HANDLE(radv_queue, queue, _queue); RADV_FROM_HANDLE(radv_fence, fence, _fence); struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; - struct radeon_winsys_ctx *ctx = queue->device->hw_ctx; + struct radeon_winsys_ctx *ctx = queue->hw_ctx; int ret; + uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; + uint32_t scratch_size = 0; + uint32_t compute_scratch_size = 0; + uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; + struct radeon_winsys_cs *preamble_cs = NULL; + VkResult result; + bool fence_emitted = false; + + /* Do this first so failing to allocate scratch buffers can't result in + * partially executed submissions. 
*/ + for (uint32_t i = 0; i < submitCount; i++) { + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + pSubmits[i].pCommandBuffers[j]); + + scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed); + compute_scratch_size = MAX2(compute_scratch_size, + cmd_buffer->compute_scratch_size_needed); + esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); + gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); + } + } + + result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs); + if (result != VK_SUCCESS) + return result; for (uint32_t i = 0; i < submitCount; i++) { struct radeon_winsys_cs **cs_array; bool can_patch = true; + uint32_t advance; + int draw_cmd_buffers_count = 0; - if (!pSubmits[i].commandBufferCount) + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + pSubmits[i].pCommandBuffers[j]); + assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + if (cmd_buffer->no_draws == true) + continue; + draw_cmd_buffers_count++; + } + + if (!draw_cmd_buffers_count) { + if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) { + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, + &queue->device->empty_cs[queue->queue_family_index], + 1, NULL, + (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores, + pSubmits[i].waitSemaphoreCount, + (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores, + pSubmits[i].signalSemaphoreCount, + false, base_fence); + if (ret) { + radv_loge("failed to submit CS %d\n", i); + abort(); + } + fence_emitted = true; + } continue; + } - cs_array = malloc(sizeof(struct radeon_winsys_cs *) * - pSubmits[i].commandBufferCount); + cs_array = malloc(sizeof(struct radeon_winsys_cs *) * draw_cmd_buffers_count); + int draw_cmd_buffer_idx = 0; for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pSubmits[i].pCommandBuffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + if (cmd_buffer->no_draws == true) + continue; - cs_array[j] = cmd_buffer->cs; + cs_array[draw_cmd_buffer_idx] = cmd_buffer->cs; + draw_cmd_buffer_idx++; if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) can_patch = false; } - ret = queue->device->ws->cs_submit(ctx, cs_array, - pSubmits[i].commandBufferCount, - can_patch, base_fence); - if (ret) - radv_loge("failed to submit CS %d\n", i); + + for (uint32_t j = 0; j < draw_cmd_buffers_count; j += advance) { + advance = MIN2(max_cs_submission, + draw_cmd_buffers_count - j); + bool b = j == 0; + bool e = j + advance == draw_cmd_buffers_count; + + if (queue->device->trace_bo) + *queue->device->trace_id_ptr = 0; + + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, + advance, preamble_cs, + (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores, + b ? pSubmits[i].waitSemaphoreCount : 0, + (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores, + e ? 
pSubmits[i].signalSemaphoreCount : 0, + can_patch, base_fence); + + if (ret) { + radv_loge("failed to submit CS %d\n", i); + abort(); + } + fence_emitted = true; + if (queue->device->trace_bo) { + bool success = queue->device->ws->ctx_wait_idle( + queue->hw_ctx, + radv_queue_family_to_ring( + queue->queue_family_index), + queue->queue_idx); + + if (!success) { /* Hang */ + radv_dump_trace(queue->device, cs_array[j]); + abort(); + } + } + } free(cs_array); } if (fence) { - if (!submitCount) - ret = queue->device->ws->cs_submit(ctx, &queue->device->empty_cs, - 1, false, base_fence); + if (!fence_emitted) + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, + &queue->device->empty_cs[queue->queue_family_index], + 1, NULL, NULL, 0, NULL, 0, + false, base_fence); fence->submitted = true; } @@ -832,7 +1564,9 @@ VkResult radv_QueueWaitIdle( { RADV_FROM_HANDLE(radv_queue, queue, _queue); - queue->device->ws->ctx_wait_idle(queue->device->hw_ctx); + queue->device->ws->ctx_wait_idle(queue->hw_ctx, + radv_queue_family_to_ring(queue->queue_family_index), + queue->queue_idx); return VK_SUCCESS; } @@ -841,7 +1575,11 @@ VkResult radv_DeviceWaitIdle( { RADV_FROM_HANDLE(radv_device, device, _device); - device->ws->ctx_wait_idle(device->hw_ctx); + for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { + for (unsigned q = 0; q < device->queue_count[i]; q++) { + radv_QueueWaitIdle(radv_queue_to_handle(&device->queues[i][q])); + } + } return VK_SUCCESS; } @@ -900,20 +1638,21 @@ VkResult radv_AllocateMemory( return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096); - if (pAllocateInfo->memoryTypeIndex == 1 || pAllocateInfo->memoryTypeIndex == 3) + if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE || + pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_CACHED) domain = RADEON_DOMAIN_GTT; else domain = RADEON_DOMAIN_VRAM; - if (pAllocateInfo->memoryTypeIndex == 0) + if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_VRAM) flags |= RADEON_FLAG_NO_CPU_ACCESS; else flags |= RADEON_FLAG_CPU_ACCESS; - if (pAllocateInfo->memoryTypeIndex == 1) + if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE) flags |= RADEON_FLAG_GTT_WC; - mem->bo = device->ws->buffer_create(device->ws, alloc_size, 32768, + mem->bo = device->ws->buffer_create(device->ws, alloc_size, 65536, domain, flags); if (!mem->bo) { @@ -1010,16 +1749,7 @@ void radv_GetBufferMemoryRequirements( { RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); - /* The Vulkan spec (git aaed022) says: - * - * memoryTypeBits is a bitfield and contains one bit set for every - * supported memory type for the resource. The bit `1<memoryTypeBits = 0x7; + pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1; pMemoryRequirements->size = buffer->size; pMemoryRequirements->alignment = 16; @@ -1032,16 +1762,7 @@ void radv_GetImageMemoryRequirements( { RADV_FROM_HANDLE(radv_image, image, _image); - /* The Vulkan spec (git aaed022) says: - * - * memoryTypeBits is a bitfield and contains one bit set for every - * supported memory type for the resource. 
The bit `1<memoryTypeBits = 0x7; + pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1; pMemoryRequirements->size = image->size; pMemoryRequirements->alignment = image->alignment; @@ -1131,7 +1852,10 @@ VkResult radv_CreateFence( fence->submitted = false; fence->signalled = !!(pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT); fence->fence = device->ws->create_fence(); - + if (!fence->fence) { + vk_free2(&device->alloc, pAllocator, fence); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } *pFence = radv_fence_to_handle(fence); @@ -1231,25 +1955,34 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence) // Queue semaphore functions VkResult radv_CreateSemaphore( - VkDevice device, + VkDevice _device, const VkSemaphoreCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSemaphore* pSemaphore) { - /* The DRM execbuffer ioctl always execute in-oder, even between different - * rings. As such, there's nothing to do for the user space semaphore. - */ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radeon_winsys_sem *sem; - *pSemaphore = (VkSemaphore)1; + sem = device->ws->create_sem(device->ws); + if (!sem) + return VK_ERROR_OUT_OF_HOST_MEMORY; + *pSemaphore = (VkSemaphore)sem; return VK_SUCCESS; } void radv_DestroySemaphore( - VkDevice device, - VkSemaphore semaphore, + VkDevice _device, + VkSemaphore _semaphore, const VkAllocationCallbacks* pAllocator) { + RADV_FROM_HANDLE(radv_device, device, _device); + struct radeon_winsys_sem *sem; + if (!_semaphore) + return; + + sem = (struct radeon_winsys_sem *)_semaphore; + device->ws->destroy_sem(sem); } VkResult radv_CreateEvent( @@ -1406,8 +2139,9 @@ radv_initialise_color_surface(struct radv_device *device, va += iview->image->dcc_offset; cb->cb_dcc_base = va >> 8; + uint32_t max_slice = iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count; cb->cb_color_view = S_028C6C_SLICE_START(iview->base_layer) | - S_028C6C_SLICE_MAX(iview->base_layer + iview->extent.depth - 1); + S_028C6C_SLICE_MAX(iview->base_layer + max_slice - 1); cb->micro_tile_mode = iview->image->surface.micro_tile_mode; pitch_tile_max = level_info->nblk_x / 8 - 1; @@ -1430,14 +2164,14 @@ radv_initialise_color_surface(struct radv_device *device, if (iview->image->fmask.size) { va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + iview->image->fmask.offset; - if (device->instance->physicalDevice.rad_info.chip_class >= CIK) + if (device->physical_device->rad_info.chip_class >= CIK) cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(iview->image->fmask.pitch_in_pixels / 8 - 1); cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(iview->image->fmask.tile_mode_index); cb->cb_color_fmask = va >> 8; cb->cb_color_fmask_slice = S_028C88_TILE_MAX(iview->image->fmask.slice_tile_max); } else { /* This must be set for fast clear to work without FMASK. 
*/ - if (device->instance->physicalDevice.rad_info.chip_class >= CIK) + if (device->physical_device->rad_info.chip_class >= CIK) cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); cb->cb_color_fmask = cb->cb_color_base; @@ -1490,13 +2224,14 @@ radv_initialise_color_surface(struct radv_device *device, if (iview->image->fmask.size) cb->cb_color_info |= S_028C70_COMPRESSION(1); - if (iview->image->cmask.size && device->allow_fast_clears) + if (iview->image->cmask.size && + !(device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS)) cb->cb_color_info |= S_028C70_FAST_CLEAR(1); if (iview->image->surface.dcc_size && level_info->dcc_enabled) cb->cb_color_info |= S_028C70_DCC_ENABLE(1); - if (device->instance->physicalDevice.rad_info.chip_class >= VI) { + if (device->physical_device->rad_info.chip_class >= VI) { unsigned max_uncompressed_block_size = 2; if (iview->image->samples > 1) { if (iview->image->surface.bpe == 1) @@ -1511,7 +2246,7 @@ radv_initialise_color_surface(struct radv_device *device, /* This must be set for fast clear to work without FMASK. */ if (!iview->image->fmask.size && - device->instance->physicalDevice.rad_info.chip_class == SI) { + device->physical_device->rad_info.chip_class == SI) { unsigned bankh = util_logbase2(iview->image->surface.bankh); cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); } @@ -1558,8 +2293,9 @@ radv_initialise_ds_surface(struct radv_device *device, z_offs += iview->image->surface.level[level].offset; s_offs += iview->image->surface.stencil_level[level].offset; + uint32_t max_slice = iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count; ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) | - S_028008_SLICE_MAX(iview->base_layer + iview->extent.depth - 1); + S_028008_SLICE_MAX(iview->base_layer + max_slice - 1); ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1); ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1); @@ -1571,8 +2307,8 @@ radv_initialise_ds_surface(struct radv_device *device, else ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_INVALID); - if (device->instance->physicalDevice.rad_info.chip_class >= CIK) { - struct radeon_info *info = &device->instance->physicalDevice.rad_info; + if (device->physical_device->rad_info.chip_class >= CIK) { + struct radeon_info *info = &device->physical_device->rad_info; unsigned tiling_index = iview->image->surface.tiling_index[level]; unsigned stencil_index = iview->image->surface.stencil_tiling_index[level]; unsigned macro_index = iview->image->surface.macro_tile_index; @@ -1802,14 +2538,7 @@ radv_init_sampler(struct radv_device *device, uint32_t max_aniso = pCreateInfo->anisotropyEnable && pCreateInfo->maxAnisotropy > 1.0 ? 
(uint32_t) pCreateInfo->maxAnisotropy : 0; uint32_t max_aniso_ratio = radv_tex_aniso_filter(max_aniso); - bool is_vi; - is_vi = (device->instance->physicalDevice.rad_info.chip_class >= VI); - - if (!is_vi && max_aniso > 0) { - radv_finishme("Anisotropic filtering must be disabled manually " - "by the shader on SI-CI when BASE_LEVEL == LAST_LEVEL\n"); - max_aniso = max_aniso_ratio = 0; - } + bool is_vi = (device->physical_device->rad_info.chip_class >= VI); sampler->state[0] = (S_008F30_CLAMP_X(radv_tex_wrap(pCreateInfo->addressModeU)) | S_008F30_CLAMP_Y(radv_tex_wrap(pCreateInfo->addressModeV)) | @@ -1870,3 +2599,48 @@ void radv_DestroySampler( return; vk_free2(&device->alloc, pAllocator, sampler); } + + +/* vk_icd.h does not declare this function, so we declare it here to + * suppress Wmissing-prototypes. + */ +PUBLIC VKAPI_ATTR VkResult VKAPI_CALL +vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t *pSupportedVersion); + +PUBLIC VKAPI_ATTR VkResult VKAPI_CALL +vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t *pSupportedVersion) +{ + /* For the full details on loader interface versioning, see + * . + * What follows is a condensed summary, to help you navigate the large and + * confusing official doc. + * + * - Loader interface v0 is incompatible with later versions. We don't + * support it. + * + * - In loader interface v1: + * - The first ICD entrypoint called by the loader is + * vk_icdGetInstanceProcAddr(). The ICD must statically expose this + * entrypoint. + * - The ICD must statically expose no other Vulkan symbol unless it is + * linked with -Bsymbolic. + * - Each dispatchable Vulkan handle created by the ICD must be + * a pointer to a struct whose first member is VK_LOADER_DATA. The + * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC. + * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and + * vkDestroySurfaceKHR(). The ICD must be capable of working with + * such loader-managed surfaces. + * + * - Loader interface v2 differs from v1 in: + * - The first ICD entrypoint called by the loader is + * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must + * statically expose this entrypoint. + * + * - Loader interface v3 differs from v2 in: + * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), + * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, + * because the loader no longer does so. + */ + *pSupportedVersion = MIN2(*pSupportedVersion, 3u); + return VK_SUCCESS; +}
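
Two of the behavioural changes in this diff are directly observable from application code: vkEnumeratePhysicalDevices() now walks every /dev/dri/renderD* node (up to RADV_MAX_DRM_DEVICES) instead of reporting at most one device, and on CIK or newer chips a second, compute-only queue family is exposed unless RADV_DEBUG=nocompute is set. The following is a minimal sketch, not part of the patch itself, that prints both; it uses only core Vulkan 1.0 entry points, omits most error handling, and sizes the family array at three to mirror the driver's own assert(*pCount <= 3).

#include <stdio.h>
#include <stdlib.h>
#include <vulkan/vulkan.h>

int main(void)
{
	/* No layers or extensions are needed just to enumerate devices. */
	VkInstanceCreateInfo info = {
		.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
	};
	VkInstance instance;
	if (vkCreateInstance(&info, NULL, &instance) != VK_SUCCESS)
		return 1;

	/* With this patch radv can report several physical devices. */
	uint32_t gpu_count = 0;
	vkEnumeratePhysicalDevices(instance, &gpu_count, NULL);
	VkPhysicalDevice *gpus = calloc(gpu_count, sizeof(*gpus));
	vkEnumeratePhysicalDevices(instance, &gpu_count, gpus);

	for (uint32_t i = 0; i < gpu_count; i++) {
		uint32_t count = 0;
		vkGetPhysicalDeviceQueueFamilyProperties(gpus[i], &count, NULL);
		if (count > 3)	/* radv asserts *pCount <= 3 */
			count = 3;
		VkQueueFamilyProperties families[3];
		vkGetPhysicalDeviceQueueFamilyProperties(gpus[i], &count, families);

		for (uint32_t f = 0; f < count; f++) {
			/* Family 1, when present, is the new async compute
			 * family: COMPUTE|TRANSFER with one queue per
			 * compute ring. */
			int compute_only =
				(families[f].queueFlags & VK_QUEUE_COMPUTE_BIT) &&
				!(families[f].queueFlags & VK_QUEUE_GRAPHICS_BIT);
			printf("gpu %u, family %u: flags 0x%x, %u queue(s)%s\n",
			       i, f, families[f].queueFlags,
			       families[f].queueCount,
			       compute_only ? " (async compute)" : "");
		}
	}

	free(gpus);
	vkDestroyInstance(instance, NULL);
	return 0;
}

The same binary also exercises the environment switches introduced in radv_CreateInstance(): running it with RADV_DEBUG=nocompute should make the second family disappear, and the remaining flags from radv_debug_options (e.g. RADV_DEBUG=nocache,shaders) are parsed from the same variable.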