X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_device.c;h=35489f0c9c69d251d4d2500453f5937f35c692c9;hb=d3ab239099e526d8350bcb3ce2746ce18b965ab9;hp=99c56a46dab24d2cf093e8302a0363e521eb7af8;hpb=0eb8b6a3e1e36d0961387e26c8ce03f7e296d81e;p=mesa.git diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 99c56a46dab..35489f0c9c6 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -25,15 +25,15 @@ * IN THE SOFTWARE. */ -#include #include #include #include #include -#include #include "radv_private.h" +#include "radv_cs.h" +#include "util/disk_cache.h" #include "util/strtod.h" - +#include "util/vk_util.h" #include #include #include @@ -43,22 +43,6 @@ #include "vk_format.h" #include "sid.h" #include "util/debug.h" -struct radv_dispatch_table dtable; - -static int -radv_get_function_timestamp(void *ptr, uint32_t* timestamp) -{ - Dl_info info; - struct stat st; - if (!dladdr(ptr, &info) || !info.dli_fname) { - return -1; - } - if (stat(info.dli_fname, &st)) { - return -1; - } - *timestamp = st.st_mtim.tv_sec; - return 0; -} static int radv_device_get_cache_uuid(enum radeon_family family, void *uuid) @@ -66,8 +50,8 @@ radv_device_get_cache_uuid(enum radeon_family family, void *uuid) uint32_t mesa_timestamp, llvm_timestamp; uint16_t f = family; memset(uuid, 0, VK_UUID_SIZE); - if (radv_get_function_timestamp(radv_device_get_cache_uuid, &mesa_timestamp) || - radv_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, &llvm_timestamp)) + if (!disk_cache_get_function_timestamp(radv_device_get_cache_uuid, &mesa_timestamp) || + !disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, &llvm_timestamp)) return -1; memcpy(uuid, &mesa_timestamp, 4); @@ -77,19 +61,192 @@ radv_device_get_cache_uuid(enum radeon_family family, void *uuid) return 0; } +static void +radv_get_device_uuid(drmDevicePtr device, void *uuid) { + memset(uuid, 0, VK_UUID_SIZE); + memcpy((char*)uuid + 0, &device->businfo.pci->domain, 2); + memcpy((char*)uuid + 2, &device->businfo.pci->bus, 1); + memcpy((char*)uuid + 3, &device->businfo.pci->dev, 1); + memcpy((char*)uuid + 4, &device->businfo.pci->func, 1); +} + +static const VkExtensionProperties instance_extensions[] = { + { + .extensionName = VK_KHR_SURFACE_EXTENSION_NAME, + .specVersion = 25, + }, +#ifdef VK_USE_PLATFORM_XCB_KHR + { + .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME, + .specVersion = 6, + }, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + { + .extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME, + .specVersion = 6, + }, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + { + .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, + .specVersion = 5, + }, +#endif + { + .extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHX_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME, + .specVersion = 1, + }, +}; + +static const VkExtensionProperties common_device_extensions[] = { + { + .extensionName = VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_INCREMENTAL_PRESENT_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_MAINTENANCE1_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME, + .specVersion = 68, + }, + { + .extensionName = VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_NV_DEDICATED_ALLOCATION_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHX_EXTERNAL_MEMORY_EXTENSION_NAME, + .specVersion = 1, + }, + { + .extensionName = VK_KHX_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + .specVersion = 1, + }, +}; + +static VkResult +radv_extensions_register(struct radv_instance *instance, + struct radv_extensions *extensions, + const VkExtensionProperties *new_ext, + uint32_t num_ext) +{ + size_t new_size; + VkExtensionProperties *new_ptr; + + assert(new_ext && num_ext > 0); + + if (!new_ext) + return VK_ERROR_INITIALIZATION_FAILED; + + new_size = (extensions->num_ext + num_ext) * sizeof(VkExtensionProperties); + new_ptr = vk_realloc(&instance->alloc, extensions->ext_array, + new_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + /* Old array continues to be valid, update nothing */ + if (!new_ptr) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + memcpy(&new_ptr[extensions->num_ext], new_ext, + num_ext * sizeof(VkExtensionProperties)); + extensions->ext_array = new_ptr; + extensions->num_ext += num_ext; + + return VK_SUCCESS; +} + +static void +radv_extensions_finish(struct radv_instance *instance, + struct radv_extensions *extensions) +{ + assert(extensions); + + if (!extensions) + radv_loge("Attemted to free invalid extension struct\n"); + + if (extensions->ext_array) + vk_free(&instance->alloc, extensions->ext_array); +} + +static bool +is_extension_enabled(const VkExtensionProperties *extensions, + size_t num_ext, + const char *name) +{ + assert(extensions && name); + + for (uint32_t i = 0; i < num_ext; i++) { + if (strcmp(name, extensions[i].extensionName) == 0) + return true; + } + + return false; +} + +static const char * +get_chip_name(enum radeon_family family) +{ + switch (family) { + case CHIP_TAHITI: return "AMD RADV TAHITI"; + case CHIP_PITCAIRN: return "AMD RADV PITCAIRN"; + case CHIP_VERDE: return "AMD RADV CAPE VERDE"; + case CHIP_OLAND: return "AMD RADV OLAND"; + case CHIP_HAINAN: return "AMD RADV HAINAN"; + case CHIP_BONAIRE: return "AMD RADV BONAIRE"; + case CHIP_KAVERI: return "AMD RADV KAVERI"; + case CHIP_KABINI: return "AMD RADV KABINI"; + case CHIP_HAWAII: return "AMD RADV HAWAII"; + case CHIP_MULLINS: return "AMD RADV MULLINS"; + case CHIP_TONGA: return "AMD RADV TONGA"; + case CHIP_ICELAND: return "AMD RADV ICELAND"; + case CHIP_CARRIZO: return "AMD RADV CARRIZO"; + case CHIP_FIJI: return "AMD RADV FIJI"; + case CHIP_POLARIS10: return "AMD RADV POLARIS10"; + case CHIP_POLARIS11: return "AMD RADV POLARIS11"; + case CHIP_POLARIS12: return "AMD RADV POLARIS12"; + case CHIP_STONEY: return "AMD RADV STONEY"; + default: return "AMD RADV unknown"; + } +} + static VkResult radv_physical_device_init(struct radv_physical_device *device, struct radv_instance *instance, - const char *path) + drmDevicePtr drm_device) { + const char *path = drm_device->nodes[DRM_NODE_RENDER]; VkResult result; drmVersionPtr version; int fd; fd = open(path, O_RDWR | O_CLOEXEC); if (fd < 0) - return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER, - "failed to open %s: %m", path); + return VK_ERROR_INCOMPATIBLE_DRIVER; version = drmGetVersion(fd); if (!version) { @@ -110,11 +267,13 @@ radv_physical_device_init(struct radv_physical_device *device, assert(strlen(path) < ARRAY_SIZE(device->path)); strncpy(device->path, path, ARRAY_SIZE(device->path)); - device->ws = radv_amdgpu_winsys_create(fd); + device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags); if (!device->ws) { result = VK_ERROR_INCOMPATIBLE_DRIVER; goto fail; } + + device->local_fd = fd; device->ws->query_info(device->ws, &device->rad_info); result = radv_init_wsi(device); if (result != VK_SUCCESS) { @@ -130,9 +289,18 @@ radv_physical_device_init(struct radv_physical_device *device, goto fail; } + result = radv_extensions_register(instance, + &device->extensions, + common_device_extensions, + ARRAY_SIZE(common_device_extensions)); + if (result != VK_SUCCESS) + goto fail; + fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); - device->name = device->rad_info.name; - close(fd); + device->name = get_chip_name(device->rad_info.family); + + radv_get_device_uuid(drm_device, device->device_uuid); + return VK_SUCCESS; fail: @@ -143,54 +311,12 @@ fail: static void radv_physical_device_finish(struct radv_physical_device *device) { + radv_extensions_finish(device->instance, &device->extensions); radv_finish_wsi(device); device->ws->destroy(device->ws); + close(device->local_fd); } -static const VkExtensionProperties global_extensions[] = { - { - .extensionName = VK_KHR_SURFACE_EXTENSION_NAME, - .specVersion = 25, - }, -#ifdef VK_USE_PLATFORM_XCB_KHR - { - .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME, - .specVersion = 6, - }, -#endif -#ifdef VK_USE_PLATFORM_XLIB_KHR - { - .extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME, - .specVersion = 6, - }, -#endif -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - { - .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, - .specVersion = 5, - }, -#endif -}; - -static const VkExtensionProperties device_extensions[] = { - { - .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME, - .specVersion = 1, - }, - { - .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME, - .specVersion = 68, - }, - { - .extensionName = VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME, - .specVersion = 1, - }, - { - .extensionName = VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_EXTENSION_NAME, - .specVersion = 1, - }, -}; - static void * default_alloc_func(void *pUserData, size_t size, size_t align, VkSystemAllocationScope allocationScope) @@ -219,7 +345,7 @@ static const VkAllocationCallbacks default_alloc = { }; static const struct debug_control radv_debug_options[] = { - {"fastclears", RADV_DEBUG_FAST_CLEARS}, + {"nofastclears", RADV_DEBUG_NO_FAST_CLEARS}, {"nodcc", RADV_DEBUG_NO_DCC}, {"shaders", RADV_DEBUG_DUMP_SHADERS}, {"nocache", RADV_DEBUG_NO_CACHE}, @@ -227,6 +353,8 @@ static const struct debug_control radv_debug_options[] = { {"nohiz", RADV_DEBUG_NO_HIZ}, {"nocompute", RADV_DEBUG_NO_COMPUTE_QUEUE}, {"unsafemath", RADV_DEBUG_UNSAFE_MATH}, + {"allbos", RADV_DEBUG_ALL_BOS}, + {"noibs", RADV_DEBUG_NO_IBS}, {NULL, 0} }; @@ -257,15 +385,9 @@ VkResult radv_CreateInstance( } for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { - bool found = false; - for (uint32_t j = 0; j < ARRAY_SIZE(global_extensions); j++) { - if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], - global_extensions[j].extensionName) == 0) { - found = true; - break; - } - } - if (!found) + if (!is_extension_enabled(instance_extensions, + ARRAY_SIZE(instance_extensions), + pCreateInfo->ppEnabledExtensionNames[i])) return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT); } @@ -274,6 +396,8 @@ VkResult radv_CreateInstance( if (!instance) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + memset(instance, 0, sizeof(*instance)); + instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC; if (pAllocator) @@ -302,10 +426,11 @@ void radv_DestroyInstance( { RADV_FROM_HANDLE(radv_instance, instance, _instance); - if (instance->physicalDeviceCount > 0) { - /* We support at most one physical device. */ - assert(instance->physicalDeviceCount == 1); - radv_physical_device_finish(&instance->physicalDevice); + if (!instance) + return; + + for (int i = 0; i < instance->physicalDeviceCount; ++i) { + radv_physical_device_finish(instance->physicalDevices + i); } VG(VALGRIND_DESTROY_MEMPOOL(instance)); @@ -315,6 +440,40 @@ void radv_DestroyInstance( vk_free(&instance->alloc, instance); } +static VkResult +radv_enumerate_devices(struct radv_instance *instance) +{ + /* TODO: Check for more devices ? */ + drmDevicePtr devices[8]; + VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER; + int max_devices; + + instance->physicalDeviceCount = 0; + + max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices)); + if (max_devices < 1) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + for (unsigned i = 0; i < (unsigned)max_devices; i++) { + if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER && + devices[i]->bustype == DRM_BUS_PCI && + devices[i]->deviceinfo.pci->vendor_id == 0x1002) { + + result = radv_physical_device_init(instance->physicalDevices + + instance->physicalDeviceCount, + instance, + devices[i]); + if (result == VK_SUCCESS) + ++instance->physicalDeviceCount; + else if (result != VK_ERROR_INCOMPATIBLE_DRIVER) + break; + } + } + drmFreeDevices(devices, max_devices); + + return result; +} + VkResult radv_EnumeratePhysicalDevices( VkInstance _instance, uint32_t* pPhysicalDeviceCount, @@ -324,53 +483,22 @@ VkResult radv_EnumeratePhysicalDevices( VkResult result; if (instance->physicalDeviceCount < 0) { - char path[20]; - for (unsigned i = 0; i < 8; i++) { - snprintf(path, sizeof(path), "/dev/dri/renderD%d", 128 + i); - result = radv_physical_device_init(&instance->physicalDevice, - instance, path); - if (result != VK_ERROR_INCOMPATIBLE_DRIVER) - break; - } - - if (result == VK_ERROR_INCOMPATIBLE_DRIVER) { - instance->physicalDeviceCount = 0; - } else if (result == VK_SUCCESS) { - instance->physicalDeviceCount = 1; - } else { + result = radv_enumerate_devices(instance); + if (result != VK_SUCCESS && + result != VK_ERROR_INCOMPATIBLE_DRIVER) return result; - } } - /* pPhysicalDeviceCount is an out parameter if pPhysicalDevices is NULL; - * otherwise it's an inout parameter. - * - * The Vulkan spec (git aaed022) says: - * - * pPhysicalDeviceCount is a pointer to an unsigned integer variable - * that is initialized with the number of devices the application is - * prepared to receive handles to. pname:pPhysicalDevices is pointer to - * an array of at least this many VkPhysicalDevice handles [...]. - * - * Upon success, if pPhysicalDevices is NULL, vkEnumeratePhysicalDevices - * overwrites the contents of the variable pointed to by - * pPhysicalDeviceCount with the number of physical devices in in the - * instance; otherwise, vkEnumeratePhysicalDevices overwrites - * pPhysicalDeviceCount with the number of physical handles written to - * pPhysicalDevices. - */ if (!pPhysicalDevices) { *pPhysicalDeviceCount = instance->physicalDeviceCount; - } else if (*pPhysicalDeviceCount >= 1) { - pPhysicalDevices[0] = radv_physical_device_to_handle(&instance->physicalDevice); - *pPhysicalDeviceCount = 1; - } else if (*pPhysicalDeviceCount < instance->physicalDeviceCount) { - return VK_INCOMPLETE; } else { - *pPhysicalDeviceCount = 0; + *pPhysicalDeviceCount = MIN2(*pPhysicalDeviceCount, instance->physicalDeviceCount); + for (unsigned i = 0; i < *pPhysicalDeviceCount; ++i) + pPhysicalDevices[i] = radv_physical_device_to_handle(instance->physicalDevices + i); } - return VK_SUCCESS; + return *pPhysicalDeviceCount < instance->physicalDeviceCount ? VK_INCOMPLETE + : VK_SUCCESS; } void radv_GetPhysicalDeviceFeatures( @@ -386,8 +514,8 @@ void radv_GetPhysicalDeviceFeatures( .fullDrawIndexUint32 = true, .imageCubeArray = true, .independentBlend = true, - .geometryShader = false, - .tessellationShader = false, + .geometryShader = true, + .tessellationShader = true, .sampleRateShading = false, .dualSrcBlend = true, .logicOp = true, @@ -400,13 +528,13 @@ void radv_GetPhysicalDeviceFeatures( .wideLines = true, .largePoints = true, .alphaToOne = true, - .multiViewport = false, + .multiViewport = true, .samplerAnisotropy = true, .textureCompressionETC2 = false, .textureCompressionASTC_LDR = false, .textureCompressionBC = true, .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, + .pipelineStatisticsQuery = true, .vertexPipelineStoresAndAtomics = true, .fragmentStoresAndAtomics = true, .shaderTessellationAndGeometryPointSize = true, @@ -417,25 +545,68 @@ void radv_GetPhysicalDeviceFeatures( .shaderSampledImageArrayDynamicIndexing = true, .shaderStorageBufferArrayDynamicIndexing = true, .shaderStorageImageArrayDynamicIndexing = true, - .shaderStorageImageReadWithoutFormat = false, - .shaderStorageImageWriteWithoutFormat = false, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, .shaderClipDistance = true, .shaderCullDistance = true, - .shaderFloat64 = false, + .shaderFloat64 = true, .shaderInt64 = false, .shaderInt16 = false, - .alphaToOne = true, - .variableMultisampleRate = false, - .inheritedQueries = false, + .sparseBinding = true, + .variableMultisampleRate = true, + .inheritedQueries = true, }; } +void radv_GetPhysicalDeviceFeatures2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2KHR *pFeatures) +{ + return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); +} + +static uint32_t radv_get_driver_version() +{ + const char *minor_string = strchr(VERSION, '.'); + const char *patch_string = minor_string ? strchr(minor_string + 1, ','): NULL; + int major = atoi(VERSION); + int minor = minor_string ? atoi(minor_string + 1) : 0; + int patch = patch_string ? atoi(patch_string + 1) : 0; + if (strstr(VERSION, "devel")) { + if (patch == 0) { + patch = 99; + if (minor == 0) { + minor = 99; + --major; + } else + --minor; + } else + --patch; + } + uint32_t version = VK_MAKE_VERSION(major, minor, patch); + return version; +} + void radv_GetPhysicalDeviceProperties( VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties* pProperties) { RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); VkSampleCountFlags sample_counts = 0xf; + + /* make sure that the entire descriptor set is addressable with a signed + * 32-bit int. So the sum of all limits scaled by descriptor size has to + * be at most 2 GiB. the combined image & samples object count as one of + * both. This limit is for the pipeline layout, not for the set layout, but + * there is no set limit, so we just set a pipeline limit. I don't think + * any app is going to hit this soon. */ + size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) / + (32 /* uniform buffer, 32 due to potential space wasted on alignement */ + + 32 /* storage buffer, 32 due to potential space wasted on alignement */ + + 32 /* sampler, largest when combined with image */ + + 64 /* sampled image */ + + 64 /* storage image */); + VkPhysicalDeviceLimits limits = { .maxImageDimension1D = (1 << 14), .maxImageDimension2D = (1 << 14), @@ -449,37 +620,37 @@ void radv_GetPhysicalDeviceProperties( .maxMemoryAllocationCount = UINT32_MAX, .maxSamplerAllocationCount = 64 * 1024, .bufferImageGranularity = 64, /* A cache line */ - .sparseAddressSpaceSize = 0, + .sparseAddressSpaceSize = 0xffffffffu, /* buffer max size */ .maxBoundDescriptorSets = MAX_SETS, - .maxPerStageDescriptorSamplers = 64, - .maxPerStageDescriptorUniformBuffers = 64, - .maxPerStageDescriptorStorageBuffers = 64, - .maxPerStageDescriptorSampledImages = 64, - .maxPerStageDescriptorStorageImages = 64, - .maxPerStageDescriptorInputAttachments = 64, - .maxPerStageResources = 128, - .maxDescriptorSetSamplers = 256, - .maxDescriptorSetUniformBuffers = 256, - .maxDescriptorSetUniformBuffersDynamic = 256, - .maxDescriptorSetStorageBuffers = 256, - .maxDescriptorSetStorageBuffersDynamic = 256, - .maxDescriptorSetSampledImages = 256, - .maxDescriptorSetStorageImages = 256, - .maxDescriptorSetInputAttachments = 256, + .maxPerStageDescriptorSamplers = max_descriptor_set_size, + .maxPerStageDescriptorUniformBuffers = max_descriptor_set_size, + .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size, + .maxPerStageDescriptorSampledImages = max_descriptor_set_size, + .maxPerStageDescriptorStorageImages = max_descriptor_set_size, + .maxPerStageDescriptorInputAttachments = max_descriptor_set_size, + .maxPerStageResources = max_descriptor_set_size, + .maxDescriptorSetSamplers = max_descriptor_set_size, + .maxDescriptorSetUniformBuffers = max_descriptor_set_size, + .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetStorageBuffers = max_descriptor_set_size, + .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetSampledImages = max_descriptor_set_size, + .maxDescriptorSetStorageImages = max_descriptor_set_size, + .maxDescriptorSetInputAttachments = max_descriptor_set_size, .maxVertexInputAttributes = 32, .maxVertexInputBindings = 32, .maxVertexInputAttributeOffset = 2047, .maxVertexInputBindingStride = 2048, .maxVertexOutputComponents = 128, - .maxTessellationGenerationLevel = 0, - .maxTessellationPatchSize = 0, - .maxTessellationControlPerVertexInputComponents = 0, - .maxTessellationControlPerVertexOutputComponents = 0, - .maxTessellationControlPerPatchOutputComponents = 0, - .maxTessellationControlTotalOutputComponents = 0, - .maxTessellationEvaluationInputComponents = 0, - .maxTessellationEvaluationOutputComponents = 0, - .maxGeometryShaderInvocations = 32, + .maxTessellationGenerationLevel = 64, + .maxTessellationPatchSize = 32, + .maxTessellationControlPerVertexInputComponents = 128, + .maxTessellationControlPerVertexOutputComponents = 128, + .maxTessellationControlPerPatchOutputComponents = 120, + .maxTessellationControlTotalOutputComponents = 4096, + .maxTessellationEvaluationInputComponents = 128, + .maxTessellationEvaluationOutputComponents = 128, + .maxGeometryShaderInvocations = 127, .maxGeometryInputComponents = 64, .maxGeometryOutputComponents = 128, .maxGeometryOutputVertices = 256, @@ -532,8 +703,8 @@ void radv_GetPhysicalDeviceProperties( .sampledImageStencilSampleCounts = sample_counts, .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, .maxSampleMaskWords = 1, - .timestampComputeAndGraphics = false, - .timestampPeriod = 100000.0 / pdevice->rad_info.clock_crystal_freq, + .timestampComputeAndGraphics = true, + .timestampPeriod = 1000000.0 / pdevice->rad_info.clock_crystal_freq, .maxClipDistances = 8, .maxCullDistances = 8, .maxCombinedClipAndCullDistances = 8, @@ -550,28 +721,55 @@ void radv_GetPhysicalDeviceProperties( }; *pProperties = (VkPhysicalDeviceProperties) { - .apiVersion = VK_MAKE_VERSION(1, 0, 5), - .driverVersion = 1, + .apiVersion = VK_MAKE_VERSION(1, 0, 42), + .driverVersion = radv_get_driver_version(), .vendorID = 0x1002, .deviceID = pdevice->rad_info.pci_id, .deviceType = VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU, .limits = limits, - .sparseProperties = {0}, /* Broadwell doesn't do sparse. */ + .sparseProperties = {0}, }; strcpy(pProperties->deviceName, pdevice->name); memcpy(pProperties->pipelineCacheUUID, pdevice->uuid, VK_UUID_SIZE); } -void radv_GetPhysicalDeviceQueueFamilyProperties( +void radv_GetPhysicalDeviceProperties2KHR( VkPhysicalDevice physicalDevice, - uint32_t* pCount, - VkQueueFamilyProperties* pQueueFamilyProperties) + VkPhysicalDeviceProperties2KHR *pProperties) { RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + + vk_foreach_struct(ext, pProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { + VkPhysicalDevicePushDescriptorPropertiesKHR *properties = + (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext; + properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHX: { + VkPhysicalDeviceIDPropertiesKHX *properties = (VkPhysicalDeviceIDPropertiesKHX*)ext; + radv_device_get_cache_uuid(0, properties->driverUUID); + memcpy(properties->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + properties->deviceLUIDValid = false; + break; + } + default: + break; + } + } +} + +static void radv_get_physical_device_queue_family_properties( + struct radv_physical_device* pdevice, + uint32_t* pCount, + VkQueueFamilyProperties** pQueueFamilyProperties) +{ int num_queue_families = 1; int idx; - if (pdevice->rad_info.compute_rings > 0 && + if (pdevice->rad_info.num_compute_rings > 0 && pdevice->rad_info.chip_class >= CIK && !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) num_queue_families++; @@ -586,10 +784,11 @@ void radv_GetPhysicalDeviceQueueFamilyProperties( idx = 0; if (*pCount >= 1) { - pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { + *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { .queueFlags = VK_QUEUE_GRAPHICS_BIT | - VK_QUEUE_COMPUTE_BIT | - VK_QUEUE_TRANSFER_BIT, + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT | + VK_QUEUE_SPARSE_BINDING_BIT, .queueCount = 1, .timestampValidBits = 64, .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, @@ -597,13 +796,15 @@ void radv_GetPhysicalDeviceQueueFamilyProperties( idx++; } - if (pdevice->rad_info.compute_rings > 0 && + if (pdevice->rad_info.num_compute_rings > 0 && pdevice->rad_info.chip_class >= CIK && !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) { if (*pCount > idx) { - pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { - .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, - .queueCount = pdevice->rad_info.compute_rings, + *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { + .queueFlags = VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT | + VK_QUEUE_SPARSE_BINDING_BIT, + .queueCount = pdevice->rad_info.num_compute_rings, .timestampValidBits = 64, .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, }; @@ -613,9 +814,47 @@ void radv_GetPhysicalDeviceQueueFamilyProperties( *pCount = idx; } +void radv_GetPhysicalDeviceQueueFamilyProperties( + VkPhysicalDevice physicalDevice, + uint32_t* pCount, + VkQueueFamilyProperties* pQueueFamilyProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + if (!pQueueFamilyProperties) { + return radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL); + return; + } + VkQueueFamilyProperties *properties[] = { + pQueueFamilyProperties + 0, + pQueueFamilyProperties + 1, + pQueueFamilyProperties + 2, + }; + radv_get_physical_device_queue_family_properties(pdevice, pCount, properties); + assert(*pCount <= 3); +} + +void radv_GetPhysicalDeviceQueueFamilyProperties2KHR( + VkPhysicalDevice physicalDevice, + uint32_t* pCount, + VkQueueFamilyProperties2KHR *pQueueFamilyProperties) +{ + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + if (!pQueueFamilyProperties) { + return radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL); + return; + } + VkQueueFamilyProperties *properties[] = { + &pQueueFamilyProperties[0].queueFamilyProperties, + &pQueueFamilyProperties[1].queueFamilyProperties, + &pQueueFamilyProperties[2].queueFamilyProperties, + }; + radv_get_physical_device_queue_family_properties(pdevice, pCount, properties); + assert(*pCount <= 3); +} + void radv_GetPhysicalDeviceMemoryProperties( VkPhysicalDevice physicalDevice, - VkPhysicalDeviceMemoryProperties* pMemoryProperties) + VkPhysicalDeviceMemoryProperties *pMemoryProperties) { RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); @@ -649,11 +888,11 @@ void radv_GetPhysicalDeviceMemoryProperties( pMemoryProperties->memoryHeapCount = RADV_MEM_HEAP_COUNT; pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM] = (VkMemoryHeap) { .size = physical_device->rad_info.vram_size - - physical_device->rad_info.visible_vram_size, + physical_device->rad_info.vram_vis_size, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, }; pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_VRAM_CPU_ACCESS] = (VkMemoryHeap) { - .size = physical_device->rad_info.visible_vram_size, + .size = physical_device->rad_info.vram_vis_size, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, }; pMemoryProperties->memoryHeaps[RADV_MEM_HEAP_GTT] = (VkMemoryHeap) { @@ -662,6 +901,14 @@ void radv_GetPhysicalDeviceMemoryProperties( }; } +void radv_GetPhysicalDeviceMemoryProperties2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2KHR *pMemoryProperties) +{ + return radv_GetPhysicalDeviceMemoryProperties(physicalDevice, + &pMemoryProperties->memoryProperties); +} + static int radv_queue_init(struct radv_device *device, struct radv_queue *queue, int queue_family_index, int idx) @@ -683,6 +930,56 @@ radv_queue_finish(struct radv_queue *queue) { if (queue->hw_ctx) queue->device->ws->ctx_destroy(queue->hw_ctx); + + if (queue->initial_preamble_cs) + queue->device->ws->cs_destroy(queue->initial_preamble_cs); + if (queue->continue_preamble_cs) + queue->device->ws->cs_destroy(queue->continue_preamble_cs); + if (queue->descriptor_bo) + queue->device->ws->buffer_destroy(queue->descriptor_bo); + if (queue->scratch_bo) + queue->device->ws->buffer_destroy(queue->scratch_bo); + if (queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); + if (queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); + if (queue->tess_factor_ring_bo) + queue->device->ws->buffer_destroy(queue->tess_factor_ring_bo); + if (queue->tess_offchip_ring_bo) + queue->device->ws->buffer_destroy(queue->tess_offchip_ring_bo); + if (queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(queue->compute_scratch_bo); +} + +static void +radv_device_init_gs_info(struct radv_device *device) +{ + switch (device->physical_device->rad_info.family) { + case CHIP_OLAND: + case CHIP_HAINAN: + case CHIP_KAVERI: + case CHIP_KABINI: + case CHIP_MULLINS: + case CHIP_ICELAND: + case CHIP_CARRIZO: + case CHIP_STONEY: + device->gs_table_depth = 16; + return; + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_VERDE: + case CHIP_BONAIRE: + case CHIP_HAWAII: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + device->gs_table_depth = 32; + return; + default: + unreachable("unknown GPU"); + } } VkResult radv_CreateDevice( @@ -696,15 +993,9 @@ VkResult radv_CreateDevice( struct radv_device *device; for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { - bool found = false; - for (uint32_t j = 0; j < ARRAY_SIZE(device_extensions); j++) { - if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], - device_extensions[j].extensionName) == 0) { - found = true; - break; - } - } - if (!found) + if (!is_extension_enabled(physical_device->extensions.ext_array, + physical_device->extensions.num_ext, + pCreateInfo->ppEnabledExtensionNames[i])) return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT); } @@ -718,6 +1009,7 @@ VkResult radv_CreateDevice( device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; device->instance = physical_device->instance; + device->physical_device = physical_device; device->debug_flags = device->instance->debug_flags; @@ -749,6 +1041,36 @@ VkResult radv_CreateDevice( } } +#if HAVE_LLVM < 0x0400 + device->llvm_supports_spill = false; +#else + device->llvm_supports_spill = true; +#endif + + /* The maximum number of scratch waves. Scratch space isn't divided + * evenly between CUs. The number is only a function of the number of CUs. + * We can decrease the constant to decrease the scratch buffer size. + * + * sctx->scratch_waves must be >= the maximum posible size of + * 1 threadgroup, so that the hw doesn't hang from being unable + * to start any. + * + * The recommended value is 4 per CU at most. Higher numbers don't + * bring much benefit, but they still occupy chip resources (think + * async compute). I've seen ~2% performance difference between 4 and 32. + */ + uint32_t max_threads_per_block = 2048; + device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, + max_threads_per_block / 64); + + radv_device_init_gs_info(device); + + device->tess_offchip_block_dw_size = + device->physical_device->rad_info.family == CHIP_HAWAII ? 4096 : 8192; + device->has_distributed_tess = + device->physical_device->rad_info.chip_class >= VI && + device->physical_device->rad_info.max_se >= 2; + result = radv_device_init_meta(device); if (result != VK_SUCCESS) goto fail; @@ -769,6 +1091,37 @@ VkResult radv_CreateDevice( break; } device->ws->cs_finalize(device->empty_cs[family]); + + device->flush_cs[family] = device->ws->cs_create(device->ws, family); + switch (family) { + case RADV_QUEUE_GENERAL: + case RADV_QUEUE_COMPUTE: + si_cs_emit_cache_flush(device->flush_cs[family], + device->physical_device->rad_info.chip_class, + family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK, + RADV_CMD_FLAG_INV_ICACHE | + RADV_CMD_FLAG_INV_SMEM_L1 | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2); + break; + } + device->ws->cs_finalize(device->flush_cs[family]); + + device->flush_shader_cs[family] = device->ws->cs_create(device->ws, family); + switch (family) { + case RADV_QUEUE_GENERAL: + case RADV_QUEUE_COMPUTE: + si_cs_emit_cache_flush(device->flush_shader_cs[family], + device->physical_device->rad_info.chip_class, + family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK, + family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) | + RADV_CMD_FLAG_INV_ICACHE | + RADV_CMD_FLAG_INV_SMEM_L1 | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2); + break; + } + device->ws->cs_finalize(device->flush_shader_cs[family]); } if (getenv("RADV_TRACE_FILE")) { @@ -782,6 +1135,23 @@ VkResult radv_CreateDevice( goto fail; } + if (device->physical_device->rad_info.chip_class >= CIK) + cik_create_gfx_config(device); + + VkPipelineCacheCreateInfo ci; + ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + ci.pNext = NULL; + ci.flags = 0; + ci.pInitialData = NULL; + ci.initialDataSize = 0; + VkPipelineCache pc; + result = radv_CreatePipelineCache(radv_device_to_handle(device), + &ci, NULL, &pc); + if (result != VK_SUCCESS) + goto fail; + + device->mem_cache = radv_pipeline_cache_from_handle(pc); + *pDevice = radv_device_to_handle(device); return VK_SUCCESS; @@ -789,6 +1159,9 @@ fail: if (device->trace_bo) device->ws->buffer_destroy(device->trace_bo); + if (device->gfx_init) + device->ws->buffer_destroy(device->gfx_init); + for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { for (unsigned q = 0; q < device->queue_count[i]; q++) radv_queue_finish(&device->queues[i][q]); @@ -806,17 +1179,32 @@ void radv_DestroyDevice( { RADV_FROM_HANDLE(radv_device, device, _device); + if (!device) + return; + if (device->trace_bo) device->ws->buffer_destroy(device->trace_bo); + if (device->gfx_init) + device->ws->buffer_destroy(device->gfx_init); + for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { for (unsigned q = 0; q < device->queue_count[i]; q++) radv_queue_finish(&device->queues[i][q]); if (device->queue_count[i]) vk_free(&device->alloc, device->queues[i]); + if (device->empty_cs[i]) + device->ws->cs_destroy(device->empty_cs[i]); + if (device->flush_cs[i]) + device->ws->cs_destroy(device->flush_cs[i]); + if (device->flush_shader_cs[i]) + device->ws->cs_destroy(device->flush_shader_cs[i]); } radv_device_finish_meta(device); + VkPipelineCache pc = radv_pipeline_cache_to_handle(device->mem_cache); + radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL); + vk_free(&device->alloc, device); } @@ -826,14 +1214,14 @@ VkResult radv_EnumerateInstanceExtensionProperties( VkExtensionProperties* pProperties) { if (pProperties == NULL) { - *pPropertyCount = ARRAY_SIZE(global_extensions); + *pPropertyCount = ARRAY_SIZE(instance_extensions); return VK_SUCCESS; } - *pPropertyCount = MIN2(*pPropertyCount, ARRAY_SIZE(global_extensions)); - typed_memcpy(pProperties, global_extensions, *pPropertyCount); + *pPropertyCount = MIN2(*pPropertyCount, ARRAY_SIZE(instance_extensions)); + typed_memcpy(pProperties, instance_extensions, *pPropertyCount); - if (*pPropertyCount < ARRAY_SIZE(global_extensions)) + if (*pPropertyCount < ARRAY_SIZE(instance_extensions)) return VK_INCOMPLETE; return VK_SUCCESS; @@ -845,15 +1233,17 @@ VkResult radv_EnumerateDeviceExtensionProperties( uint32_t* pPropertyCount, VkExtensionProperties* pProperties) { + RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + if (pProperties == NULL) { - *pPropertyCount = ARRAY_SIZE(device_extensions); + *pPropertyCount = pdevice->extensions.num_ext; return VK_SUCCESS; } - *pPropertyCount = MIN2(*pPropertyCount, ARRAY_SIZE(device_extensions)); - typed_memcpy(pProperties, device_extensions, *pPropertyCount); + *pPropertyCount = MIN2(*pPropertyCount, pdevice->extensions.num_ext); + typed_memcpy(pProperties, pdevice->extensions.ext_array, *pPropertyCount); - if (*pPropertyCount < ARRAY_SIZE(device_extensions)) + if (*pPropertyCount < pdevice->extensions.num_ext) return VK_INCOMPLETE; return VK_SUCCESS; @@ -912,6 +1302,557 @@ static void radv_dump_trace(struct radv_device *device, fclose(f); } +static void +fill_geom_tess_rings(struct radv_queue *queue, + uint32_t *map, + bool add_sample_positions, + uint32_t esgs_ring_size, + struct radeon_winsys_bo *esgs_ring_bo, + uint32_t gsvs_ring_size, + struct radeon_winsys_bo *gsvs_ring_bo, + uint32_t tess_factor_ring_size, + struct radeon_winsys_bo *tess_factor_ring_bo, + uint32_t tess_offchip_ring_size, + struct radeon_winsys_bo *tess_offchip_ring_bo) +{ + uint64_t esgs_va = 0, gsvs_va = 0; + uint64_t tess_factor_va = 0, tess_offchip_va = 0; + uint32_t *desc = &map[4]; + + if (esgs_ring_bo) + esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo); + if (gsvs_ring_bo) + gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo); + if (tess_factor_ring_bo) + tess_factor_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo); + if (tess_offchip_ring_bo) + tess_offchip_va = queue->device->ws->buffer_get_va(tess_offchip_ring_bo); + + /* stride 0, num records - size, add tid, swizzle, elsize4, + index stride 64 */ + desc[0] = esgs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(true); + desc[2] = esgs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | + S_008F0C_INDEX_STRIDE(3) | + S_008F0C_ADD_TID_ENABLE(true); + + desc += 4; + /* GS entry for ES->GS ring */ + /* stride 0, num records - size, elsize0, + index stride 0 */ + desc[0] = esgs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = esgs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + + desc += 4; + /* VS entry for GS->VS ring */ + /* stride 0, num records - size, elsize0, + index stride 0 */ + desc[0] = gsvs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = gsvs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + /* stride gsvs_itemsize, num records 64 + elsize 4, index stride 16 */ + /* shader will patch stride and desc[2] */ + desc[0] = gsvs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(true); + desc[2] = 0; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | + S_008F0C_INDEX_STRIDE(1) | + S_008F0C_ADD_TID_ENABLE(true); + desc += 4; + + desc[0] = tess_factor_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(tess_factor_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = tess_factor_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + desc[0] = tess_offchip_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(tess_offchip_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = tess_offchip_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + /* add sample positions after all rings */ + memcpy(desc, queue->device->sample_locations_1x, 8); + desc += 2; + memcpy(desc, queue->device->sample_locations_2x, 16); + desc += 4; + memcpy(desc, queue->device->sample_locations_4x, 32); + desc += 8; + memcpy(desc, queue->device->sample_locations_8x, 64); + desc += 16; + memcpy(desc, queue->device->sample_locations_16x, 128); +} + +static unsigned +radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buffers_p) +{ + bool double_offchip_buffers = device->physical_device->rad_info.chip_class >= CIK && + device->physical_device->rad_info.family != CHIP_CARRIZO && + device->physical_device->rad_info.family != CHIP_STONEY; + unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; + unsigned max_offchip_buffers = max_offchip_buffers_per_se * + device->physical_device->rad_info.max_se; + unsigned offchip_granularity; + unsigned hs_offchip_param; + switch (device->tess_offchip_block_dw_size) { + default: + assert(0); + /* fall through */ + case 8192: + offchip_granularity = V_03093C_X_8K_DWORDS; + break; + case 4096: + offchip_granularity = V_03093C_X_4K_DWORDS; + break; + } + + switch (device->physical_device->rad_info.chip_class) { + case SI: + max_offchip_buffers = MIN2(max_offchip_buffers, 126); + break; + case CIK: + max_offchip_buffers = MIN2(max_offchip_buffers, 508); + break; + case VI: + default: + max_offchip_buffers = MIN2(max_offchip_buffers, 512); + break; + } + + *max_offchip_buffers_p = max_offchip_buffers; + if (device->physical_device->rad_info.chip_class >= CIK) { + if (device->physical_device->rad_info.chip_class >= VI) + --max_offchip_buffers; + hs_offchip_param = + S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | + S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); + } else { + hs_offchip_param = + S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); + } + return hs_offchip_param; +} + +static VkResult +radv_get_preamble_cs(struct radv_queue *queue, + uint32_t scratch_size, + uint32_t compute_scratch_size, + uint32_t esgs_ring_size, + uint32_t gsvs_ring_size, + bool needs_tess_rings, + bool needs_sample_positions, + struct radeon_winsys_cs **initial_preamble_cs, + struct radeon_winsys_cs **continue_preamble_cs) +{ + struct radeon_winsys_bo *scratch_bo = NULL; + struct radeon_winsys_bo *descriptor_bo = NULL; + struct radeon_winsys_bo *compute_scratch_bo = NULL; + struct radeon_winsys_bo *esgs_ring_bo = NULL; + struct radeon_winsys_bo *gsvs_ring_bo = NULL; + struct radeon_winsys_bo *tess_factor_ring_bo = NULL; + struct radeon_winsys_bo *tess_offchip_ring_bo = NULL; + struct radeon_winsys_cs *dest_cs[2] = {0}; + bool add_tess_rings = false, add_sample_positions = false; + unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; + unsigned max_offchip_buffers; + unsigned hs_offchip_param = 0; + if (!queue->has_tess_rings) { + if (needs_tess_rings) + add_tess_rings = true; + } + if (!queue->has_sample_positions) { + if (needs_sample_positions) + add_sample_positions = true; + } + tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se; + hs_offchip_param = radv_get_hs_offchip_param(queue->device, + &max_offchip_buffers); + tess_offchip_ring_size = max_offchip_buffers * + queue->device->tess_offchip_block_dw_size * 4; + + if (scratch_size <= queue->scratch_size && + compute_scratch_size <= queue->compute_scratch_size && + esgs_ring_size <= queue->esgs_ring_size && + gsvs_ring_size <= queue->gsvs_ring_size && + !add_tess_rings && !add_sample_positions && + queue->initial_preamble_cs) { + *initial_preamble_cs = queue->initial_preamble_cs; + *continue_preamble_cs = queue->continue_preamble_cs; + if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) + *continue_preamble_cs = NULL; + return VK_SUCCESS; + } + + if (scratch_size > queue->scratch_size) { + scratch_bo = queue->device->ws->buffer_create(queue->device->ws, + scratch_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!scratch_bo) + goto fail; + } else + scratch_bo = queue->scratch_bo; + + if (compute_scratch_size > queue->compute_scratch_size) { + compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws, + compute_scratch_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!compute_scratch_bo) + goto fail; + + } else + compute_scratch_bo = queue->compute_scratch_bo; + + if (esgs_ring_size > queue->esgs_ring_size) { + esgs_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + esgs_ring_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!esgs_ring_bo) + goto fail; + } else { + esgs_ring_bo = queue->esgs_ring_bo; + esgs_ring_size = queue->esgs_ring_size; + } + + if (gsvs_ring_size > queue->gsvs_ring_size) { + gsvs_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + gsvs_ring_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!gsvs_ring_bo) + goto fail; + } else { + gsvs_ring_bo = queue->gsvs_ring_bo; + gsvs_ring_size = queue->gsvs_ring_size; + } + + if (add_tess_rings) { + tess_factor_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + tess_factor_ring_size, + 256, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!tess_factor_ring_bo) + goto fail; + tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + tess_offchip_ring_size, + 256, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!tess_offchip_ring_bo) + goto fail; + } else { + tess_factor_ring_bo = queue->tess_factor_ring_bo; + tess_offchip_ring_bo = queue->tess_offchip_ring_bo; + } + + if (scratch_bo != queue->scratch_bo || + esgs_ring_bo != queue->esgs_ring_bo || + gsvs_ring_bo != queue->gsvs_ring_bo || + tess_factor_ring_bo != queue->tess_factor_ring_bo || + tess_offchip_ring_bo != queue->tess_offchip_ring_bo || add_sample_positions) { + uint32_t size = 0; + if (gsvs_ring_bo || esgs_ring_bo || + tess_factor_ring_bo || tess_offchip_ring_bo || add_sample_positions) { + size = 112; /* 2 dword + 2 padding + 4 dword * 6 */ + if (add_sample_positions) + size += 256; /* 32+16+8+4+2+1 samples * 4 * 2 = 248 bytes. */ + } + else if (scratch_bo) + size = 8; /* 2 dword */ + + descriptor_bo = queue->device->ws->buffer_create(queue->device->ws, + size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_CPU_ACCESS); + if (!descriptor_bo) + goto fail; + } else + descriptor_bo = queue->descriptor_bo; + + for(int i = 0; i < 2; ++i) { + struct radeon_winsys_cs *cs = NULL; + cs = queue->device->ws->cs_create(queue->device->ws, + queue->queue_family_index ? RING_COMPUTE : RING_GFX); + if (!cs) + goto fail; + + dest_cs[i] = cs; + + if (scratch_bo) + queue->device->ws->cs_add_buffer(cs, scratch_bo, 8); + + if (esgs_ring_bo) + queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8); + + if (gsvs_ring_bo) + queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8); + + if (tess_factor_ring_bo) + queue->device->ws->cs_add_buffer(cs, tess_factor_ring_bo, 8); + + if (tess_offchip_ring_bo) + queue->device->ws->cs_add_buffer(cs, tess_offchip_ring_bo, 8); + + if (descriptor_bo) + queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8); + + if (descriptor_bo != queue->descriptor_bo) { + uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo); + + if (scratch_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + map[0] = scratch_va; + map[1] = rsrc1; + } + + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo || + add_sample_positions) + fill_geom_tess_rings(queue, map, add_sample_positions, + esgs_ring_size, esgs_ring_bo, + gsvs_ring_size, gsvs_ring_bo, + tess_factor_ring_size, tess_factor_ring_bo, + tess_offchip_ring_size, tess_offchip_ring_bo); + + queue->device->ws->buffer_unmap(descriptor_bo); + } + + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + + if (esgs_ring_bo || gsvs_ring_bo) { + if (queue->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2); + radeon_emit(cs, esgs_ring_size >> 8); + radeon_emit(cs, gsvs_ring_size >> 8); + } else { + radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2); + radeon_emit(cs, esgs_ring_size >> 8); + radeon_emit(cs, gsvs_ring_size >> 8); + } + } + + if (tess_factor_ring_bo) { + uint64_t tf_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo); + if (queue->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(tess_factor_ring_size / 4)); + radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, + tf_va >> 8); + radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param); + } else { + radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(tess_factor_ring_size / 4)); + radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE, + tf_va >> 8); + radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM, + hs_offchip_param); + } + } + + if (descriptor_bo) { + uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0, + R_00B130_SPI_SHADER_USER_DATA_VS_0, + R_00B230_SPI_SHADER_USER_DATA_GS_0, + R_00B330_SPI_SHADER_USER_DATA_ES_0, + R_00B430_SPI_SHADER_USER_DATA_HS_0, + R_00B530_SPI_SHADER_USER_DATA_LS_0}; + + uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo); + + for (int i = 0; i < ARRAY_SIZE(regs); ++i) { + radeon_set_sh_reg_seq(cs, regs[i], 2); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } + } + + if (compute_scratch_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + + queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8); + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); + radeon_emit(cs, scratch_va); + radeon_emit(cs, rsrc1); + } + + if (!i) { + si_cs_emit_cache_flush(cs, + queue->device->physical_device->rad_info.chip_class, + queue->queue_family_index == RING_COMPUTE && + queue->device->physical_device->rad_info.chip_class >= CIK, + RADV_CMD_FLAG_INV_ICACHE | + RADV_CMD_FLAG_INV_SMEM_L1 | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2); + } + + if (!queue->device->ws->cs_finalize(cs)) + goto fail; + } + + if (queue->initial_preamble_cs) + queue->device->ws->cs_destroy(queue->initial_preamble_cs); + + if (queue->continue_preamble_cs) + queue->device->ws->cs_destroy(queue->continue_preamble_cs); + + queue->initial_preamble_cs = dest_cs[0]; + queue->continue_preamble_cs = dest_cs[1]; + + if (scratch_bo != queue->scratch_bo) { + if (queue->scratch_bo) + queue->device->ws->buffer_destroy(queue->scratch_bo); + queue->scratch_bo = scratch_bo; + queue->scratch_size = scratch_size; + } + + if (compute_scratch_bo != queue->compute_scratch_bo) { + if (queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(queue->compute_scratch_bo); + queue->compute_scratch_bo = compute_scratch_bo; + queue->compute_scratch_size = compute_scratch_size; + } + + if (esgs_ring_bo != queue->esgs_ring_bo) { + if (queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); + queue->esgs_ring_bo = esgs_ring_bo; + queue->esgs_ring_size = esgs_ring_size; + } + + if (gsvs_ring_bo != queue->gsvs_ring_bo) { + if (queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); + queue->gsvs_ring_bo = gsvs_ring_bo; + queue->gsvs_ring_size = gsvs_ring_size; + } + + if (tess_factor_ring_bo != queue->tess_factor_ring_bo) { + queue->tess_factor_ring_bo = tess_factor_ring_bo; + } + + if (tess_offchip_ring_bo != queue->tess_offchip_ring_bo) { + queue->tess_offchip_ring_bo = tess_offchip_ring_bo; + queue->has_tess_rings = true; + } + + if (descriptor_bo != queue->descriptor_bo) { + if (queue->descriptor_bo) + queue->device->ws->buffer_destroy(queue->descriptor_bo); + + queue->descriptor_bo = descriptor_bo; + } + + if (add_sample_positions) + queue->has_sample_positions = true; + + *initial_preamble_cs = queue->initial_preamble_cs; + *continue_preamble_cs = queue->continue_preamble_cs; + if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) + *continue_preamble_cs = NULL; + return VK_SUCCESS; +fail: + for (int i = 0; i < ARRAY_SIZE(dest_cs); ++i) + if (dest_cs[i]) + queue->device->ws->cs_destroy(dest_cs[i]); + if (descriptor_bo && descriptor_bo != queue->descriptor_bo) + queue->device->ws->buffer_destroy(descriptor_bo); + if (scratch_bo && scratch_bo != queue->scratch_bo) + queue->device->ws->buffer_destroy(scratch_bo); + if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo) + queue->device->ws->buffer_destroy(compute_scratch_bo); + if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(esgs_ring_bo); + if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(gsvs_ring_bo); + if (tess_factor_ring_bo && tess_factor_ring_bo != queue->tess_factor_ring_bo) + queue->device->ws->buffer_destroy(tess_factor_ring_bo); + if (tess_offchip_ring_bo && tess_offchip_ring_bo != queue->tess_offchip_ring_bo) + queue->device->ws->buffer_destroy(tess_offchip_ring_bo); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; +} + VkResult radv_QueueSubmit( VkQueue _queue, uint32_t submitCount, @@ -924,39 +1865,93 @@ VkResult radv_QueueSubmit( struct radeon_winsys_ctx *ctx = queue->hw_ctx; int ret; uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; + uint32_t scratch_size = 0; + uint32_t compute_scratch_size = 0; + uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; + struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL; + VkResult result; + bool fence_emitted = false; + bool tess_rings_needed = false; + bool sample_positions_needed = false; + + /* Do this first so failing to allocate scratch buffers can't result in + * partially executed submissions. */ + for (uint32_t i = 0; i < submitCount; i++) { + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + pSubmits[i].pCommandBuffers[j]); + + scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed); + compute_scratch_size = MAX2(compute_scratch_size, + cmd_buffer->compute_scratch_size_needed); + esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); + gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); + tess_rings_needed |= cmd_buffer->tess_rings_needed; + sample_positions_needed |= cmd_buffer->sample_positions_needed; + } + } + + result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, + esgs_ring_size, gsvs_ring_size, tess_rings_needed, + sample_positions_needed, + &initial_preamble_cs, &continue_preamble_cs); + if (result != VK_SUCCESS) + return result; for (uint32_t i = 0; i < submitCount; i++) { struct radeon_winsys_cs **cs_array; - bool can_patch = true; + bool do_flush = !i || pSubmits[i].pWaitDstStageMask; + bool can_patch = !do_flush; uint32_t advance; - if (!pSubmits[i].commandBufferCount) + if (!pSubmits[i].commandBufferCount) { + if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) { + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, + &queue->device->empty_cs[queue->queue_family_index], + 1, NULL, NULL, + (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores, + pSubmits[i].waitSemaphoreCount, + (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores, + pSubmits[i].signalSemaphoreCount, + false, base_fence); + if (ret) { + radv_loge("failed to submit CS %d\n", i); + abort(); + } + fence_emitted = true; + } continue; + } cs_array = malloc(sizeof(struct radeon_winsys_cs *) * - pSubmits[i].commandBufferCount); + (pSubmits[i].commandBufferCount + do_flush)); + + if(do_flush) + cs_array[0] = pSubmits[i].waitSemaphoreCount ? + queue->device->flush_shader_cs[queue->queue_family_index] : + queue->device->flush_cs[queue->queue_family_index]; for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pSubmits[i].pCommandBuffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - cs_array[j] = cmd_buffer->cs; + cs_array[j + do_flush] = cmd_buffer->cs; if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) can_patch = false; } - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) { + for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + do_flush; j += advance) { advance = MIN2(max_cs_submission, - pSubmits[i].commandBufferCount - j); + pSubmits[i].commandBufferCount + do_flush - j); bool b = j == 0; - bool e = j + advance == pSubmits[i].commandBufferCount; + bool e = j + advance == pSubmits[i].commandBufferCount + do_flush; if (queue->device->trace_bo) *queue->device->trace_id_ptr = 0; - ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array, - pSubmits[i].commandBufferCount, + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, + advance, initial_preamble_cs, continue_preamble_cs, (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores, b ? pSubmits[i].waitSemaphoreCount : 0, (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores, @@ -967,6 +1962,7 @@ VkResult radv_QueueSubmit( radv_loge("failed to submit CS %d\n", i); abort(); } + fence_emitted = true; if (queue->device->trace_bo) { bool success = queue->device->ws->ctx_wait_idle( queue->hw_ctx, @@ -984,10 +1980,11 @@ VkResult radv_QueueSubmit( } if (fence) { - if (!submitCount) + if (!fence_emitted) ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, 0, NULL, 0, false, base_fence); + 1, NULL, NULL, NULL, 0, NULL, 0, + false, base_fence); fence->submitted = true; } @@ -1049,6 +2046,21 @@ PFN_vkVoidFunction radv_GetDeviceProcAddr( return radv_lookup_entrypoint(pName); } +bool radv_get_memory_fd(struct radv_device *device, + struct radv_device_memory *memory, + int *pFD) +{ + struct radeon_bo_metadata metadata; + + if (memory->image) { + radv_init_metadata(device, memory->image, &metadata); + device->ws->buffer_set_metadata(memory->bo, &metadata); + } + + return device->ws->buffer_get_fd(device->ws, memory->bo, + pFD); +} + VkResult radv_AllocateMemory( VkDevice _device, const VkMemoryAllocateInfo* pAllocateInfo, @@ -1060,6 +2072,7 @@ VkResult radv_AllocateMemory( VkResult result; enum radeon_bo_domain domain; uint32_t flags = 0; + assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); if (pAllocateInfo->allocationSize == 0) { @@ -1068,11 +2081,35 @@ VkResult radv_AllocateMemory( return VK_SUCCESS; } + const VkImportMemoryFdInfoKHX *import_info = + vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHX); + const VkDedicatedAllocationMemoryAllocateInfoNV *dedicate_info = + vk_find_struct_const(pAllocateInfo->pNext, DEDICATED_ALLOCATION_MEMORY_ALLOCATE_INFO_NV); + mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (mem == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + if (dedicate_info) { + mem->image = radv_image_from_handle(dedicate_info->image); + mem->buffer = radv_buffer_from_handle(dedicate_info->buffer); + } else { + mem->image = NULL; + mem->buffer = NULL; + } + + if (import_info) { + assert(import_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHX); + mem->bo = device->ws->buffer_from_fd(device->ws, import_info->fd, + NULL, NULL); + if (!mem->bo) + goto fail; + else + goto out_success; + } + uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096); if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE || pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_CACHED) @@ -1088,7 +2125,7 @@ VkResult radv_AllocateMemory( if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE) flags |= RADEON_FLAG_GTT_WC; - mem->bo = device->ws->buffer_create(device->ws, alloc_size, 32768, + mem->bo = device->ws->buffer_create(device->ws, alloc_size, 65536, domain, flags); if (!mem->bo) { @@ -1096,7 +2133,7 @@ VkResult radv_AllocateMemory( goto fail; } mem->type_index = pAllocateInfo->memoryTypeIndex; - +out_success: *pMem = radv_device_memory_to_handle(mem); return VK_SUCCESS; @@ -1187,8 +2224,12 @@ void radv_GetBufferMemoryRequirements( pMemoryRequirements->memoryTypeBits = (1u << RADV_MEM_TYPE_COUNT) - 1; - pMemoryRequirements->size = buffer->size; - pMemoryRequirements->alignment = 16; + if (buffer->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) + pMemoryRequirements->alignment = 4096; + else + pMemoryRequirements->alignment = 16; + + pMemoryRequirements->size = align64(buffer->size, pMemoryRequirements->alignment); } void radv_GetImageMemoryRequirements( @@ -1261,13 +2302,89 @@ VkResult radv_BindImageMemory( return VK_SUCCESS; } -VkResult radv_QueueBindSparse( - VkQueue queue, + +static void +radv_sparse_buffer_bind_memory(struct radv_device *device, + const VkSparseBufferMemoryBindInfo *bind) +{ + RADV_FROM_HANDLE(radv_buffer, buffer, bind->buffer); + + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct radv_device_memory *mem = NULL; + + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = radv_device_memory_from_handle(bind->pBinds[i].memory); + + device->ws->buffer_virtual_bind(buffer->bo, + bind->pBinds[i].resourceOffset, + bind->pBinds[i].size, + mem ? mem->bo : NULL, + bind->pBinds[i].memoryOffset); + } +} + +static void +radv_sparse_image_opaque_bind_memory(struct radv_device *device, + const VkSparseImageOpaqueMemoryBindInfo *bind) +{ + RADV_FROM_HANDLE(radv_image, image, bind->image); + + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct radv_device_memory *mem = NULL; + + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = radv_device_memory_from_handle(bind->pBinds[i].memory); + + device->ws->buffer_virtual_bind(image->bo, + bind->pBinds[i].resourceOffset, + bind->pBinds[i].size, + mem ? mem->bo : NULL, + bind->pBinds[i].memoryOffset); + } +} + + VkResult radv_QueueBindSparse( + VkQueue _queue, uint32_t bindInfoCount, const VkBindSparseInfo* pBindInfo, - VkFence fence) + VkFence _fence) { - stub_return(VK_ERROR_INCOMPATIBLE_DRIVER); + RADV_FROM_HANDLE(radv_fence, fence, _fence); + RADV_FROM_HANDLE(radv_queue, queue, _queue); + struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; + bool fence_emitted = false; + + for (uint32_t i = 0; i < bindInfoCount; ++i) { + for (uint32_t j = 0; j < pBindInfo[i].bufferBindCount; ++j) { + radv_sparse_buffer_bind_memory(queue->device, + pBindInfo[i].pBufferBinds + j); + } + + for (uint32_t j = 0; j < pBindInfo[i].imageOpaqueBindCount; ++j) { + radv_sparse_image_opaque_bind_memory(queue->device, + pBindInfo[i].pImageOpaqueBinds + j); + } + + if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) { + queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx, + &queue->device->empty_cs[queue->queue_family_index], + 1, NULL, NULL, + (struct radeon_winsys_sem **)pBindInfo[i].pWaitSemaphores, + pBindInfo[i].waitSemaphoreCount, + (struct radeon_winsys_sem **)pBindInfo[i].pSignalSemaphores, + pBindInfo[i].signalSemaphoreCount, + false, base_fence); + fence_emitted = true; + if (fence) + fence->submitted = true; + } + } + + if (fence && !fence_emitted) { + fence->signalled = true; + } + + return VK_SUCCESS; } VkResult radv_CreateFence( @@ -1403,7 +2520,7 @@ VkResult radv_CreateSemaphore( if (!sem) return VK_ERROR_OUT_OF_HOST_MEMORY; - *pSemaphore = (VkSemaphore)sem; + *pSemaphore = radeon_winsys_sem_to_handle(sem); return VK_SUCCESS; } @@ -1413,11 +2530,10 @@ void radv_DestroySemaphore( const VkAllocationCallbacks* pAllocator) { RADV_FROM_HANDLE(radv_device, device, _device); - struct radeon_winsys_sem *sem; + RADV_FROM_HANDLE(radeon_winsys_sem, sem, _semaphore); if (!_semaphore) return; - sem = (struct radeon_winsys_sem *)_semaphore; device->ws->destroy_sem(sem); } @@ -1515,6 +2631,17 @@ VkResult radv_CreateBuffer( buffer->usage = pCreateInfo->usage; buffer->bo = NULL; buffer->offset = 0; + buffer->flags = pCreateInfo->flags; + + if (pCreateInfo->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) { + buffer->bo = device->ws->buffer_create(device->ws, + align64(buffer->size, 4096), + 4096, 0, RADEON_FLAG_VIRTUAL); + if (!buffer->bo) { + vk_free2(&device->alloc, pAllocator, buffer); + return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + } *pBuffer = radv_buffer_to_handle(buffer); @@ -1532,6 +2659,9 @@ void radv_DestroyBuffer( if (!buffer) return; + if (buffer->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) + device->ws->buffer_destroy(buffer->bo); + vk_free2(&device->alloc, pAllocator, buffer); } @@ -1539,9 +2669,14 @@ static inline unsigned si_tile_mode_index(const struct radv_image *image, unsigned level, bool stencil) { if (stencil) - return image->surface.stencil_tiling_index[level]; + return image->surface.u.legacy.stencil_tiling_index[level]; else - return image->surface.tiling_index[level]; + return image->surface.u.legacy.tiling_index[level]; +} + +static uint32_t radv_surface_layer_count(struct radv_image_view *iview) +{ + return iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count; } static void @@ -1555,7 +2690,7 @@ radv_initialise_color_surface(struct radv_device *device, unsigned pitch_tile_max, slice_tile_max, tile_mode_index; uint64_t va; const struct radeon_surf *surf = &iview->image->surface; - const struct radeon_surf_level *level_info = &surf->level[iview->base_mip]; + const struct legacy_surf_level *level_info = &surf->u.legacy.level[iview->base_mip]; desc = vk_format_description(iview->vk_format); @@ -1575,8 +2710,9 @@ radv_initialise_color_surface(struct radv_device *device, va += iview->image->dcc_offset; cb->cb_dcc_base = va >> 8; + uint32_t max_slice = radv_surface_layer_count(iview); cb->cb_color_view = S_028C6C_SLICE_START(iview->base_layer) | - S_028C6C_SLICE_MAX(iview->base_layer + iview->extent.depth - 1); + S_028C6C_SLICE_MAX(iview->base_layer + max_slice - 1); cb->micro_tile_mode = iview->image->surface.micro_tile_mode; pitch_tile_max = level_info->nblk_x / 8 - 1; @@ -1590,8 +2726,8 @@ radv_initialise_color_surface(struct radv_device *device, cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == VK_SWIZZLE_1) | S_028C74_TILE_MODE_INDEX(tile_mode_index); - if (iview->image->samples > 1) { - unsigned log_samples = util_logbase2(iview->image->samples); + if (iview->image->info.samples > 1) { + unsigned log_samples = util_logbase2(iview->image->info.samples); cb->cb_color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_samples); @@ -1599,14 +2735,14 @@ radv_initialise_color_surface(struct radv_device *device, if (iview->image->fmask.size) { va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + iview->image->fmask.offset; - if (device->instance->physicalDevice.rad_info.chip_class >= CIK) + if (device->physical_device->rad_info.chip_class >= CIK) cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(iview->image->fmask.pitch_in_pixels / 8 - 1); cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(iview->image->fmask.tile_mode_index); cb->cb_color_fmask = va >> 8; cb->cb_color_fmask_slice = S_028C88_TILE_MAX(iview->image->fmask.slice_tile_max); } else { /* This must be set for fast clear to work without FMASK. */ - if (device->instance->physicalDevice.rad_info.chip_class >= CIK) + if (device->physical_device->rad_info.chip_class >= CIK) cb->cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); cb->cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); cb->cb_color_fmask = cb->cb_color_base; @@ -1655,20 +2791,20 @@ radv_initialise_color_surface(struct radv_device *device, format != V_028C70_COLOR_24_8) | S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian); - if (iview->image->samples > 1) + if (iview->image->info.samples > 1) if (iview->image->fmask.size) cb->cb_color_info |= S_028C70_COMPRESSION(1); if (iview->image->cmask.size && - (device->debug_flags & RADV_DEBUG_FAST_CLEARS)) + !(device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS)) cb->cb_color_info |= S_028C70_FAST_CLEAR(1); - if (iview->image->surface.dcc_size && level_info->dcc_enabled) + if (iview->image->surface.dcc_size && iview->base_mip < surf->num_dcc_levels) cb->cb_color_info |= S_028C70_DCC_ENABLE(1); - if (device->instance->physicalDevice.rad_info.chip_class >= VI) { + if (device->physical_device->rad_info.chip_class >= VI) { unsigned max_uncompressed_block_size = 2; - if (iview->image->samples > 1) { + if (iview->image->info.samples > 1) { if (iview->image->surface.bpe == 1) max_uncompressed_block_size = 0; else if (iview->image->surface.bpe == 2) @@ -1681,8 +2817,8 @@ radv_initialise_color_surface(struct radv_device *device, /* This must be set for fast clear to work without FMASK. */ if (!iview->image->fmask.size && - device->instance->physicalDevice.rad_info.chip_class == SI) { - unsigned bankh = util_logbase2(iview->image->surface.bankh); + device->physical_device->rad_info.chip_class == SI) { + unsigned bankh = util_logbase2(iview->image->surface.u.legacy.bankh); cb->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); } } @@ -1693,9 +2829,9 @@ radv_initialise_ds_surface(struct radv_device *device, struct radv_image_view *iview) { unsigned level = iview->base_mip; - unsigned format; + unsigned format, stencil_format; uint64_t va, s_offs, z_offs; - const struct radeon_surf_level *level_info = &iview->image->surface.level[level]; + bool stencil_only = false; memset(ds, 0, sizeof(*ds)); switch (iview->vk_format) { case VK_FORMAT_D24_UNORM_S8_UINT: @@ -1714,95 +2850,91 @@ radv_initialise_ds_surface(struct radv_device *device, S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); ds->offset_scale = 1.0f; break; + case VK_FORMAT_S8_UINT: + stencil_only = true; + break; default: break; } format = radv_translate_dbformat(iview->vk_format); - if (format == V_028040_Z_INVALID) { - fprintf(stderr, "Invalid DB format: %d, disabling DB.\n", iview->vk_format); - } + stencil_format = iview->image->surface.flags & RADEON_SURF_SBUFFER ? + V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; + + uint32_t max_slice = radv_surface_layer_count(iview); + ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) | + S_028008_SLICE_MAX(iview->base_layer + max_slice - 1); + + ds->db_htile_data_base = 0; + ds->db_htile_surface = 0; va = device->ws->buffer_get_va(iview->bo) + iview->image->offset; s_offs = z_offs = va; - z_offs += iview->image->surface.level[level].offset; - s_offs += iview->image->surface.stencil_level[level].offset; - ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) | - S_028008_SLICE_MAX(iview->base_layer + iview->extent.depth - 1); - ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1); - ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1); + { + const struct legacy_surf_level *level_info = &iview->image->surface.u.legacy.level[level]; + + if (stencil_only) + level_info = &iview->image->surface.u.legacy.stencil_level[level]; + + z_offs += iview->image->surface.u.legacy.level[level].offset; + s_offs += iview->image->surface.u.legacy.stencil_level[level].offset; + + ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1); + ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1); + ds->db_stencil_info = S_028044_FORMAT(stencil_format); + + if (iview->image->info.samples > 1) + ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->info.samples)); + + if (device->physical_device->rad_info.chip_class >= CIK) { + struct radeon_info *info = &device->physical_device->rad_info; + unsigned tiling_index = iview->image->surface.u.legacy.tiling_index[level]; + unsigned stencil_index = iview->image->surface.u.legacy.stencil_tiling_index[level]; + unsigned macro_index = iview->image->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[tiling_index]; + unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; + unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; + + if (stencil_only) + tile_mode = stencil_tile_mode; + + ds->db_depth_info |= + S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | + S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | + S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | + S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | + S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | + S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); + ds->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); + ds->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); + } else { + unsigned tile_mode_index = si_tile_mode_index(iview->image, level, false); + ds->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); + tile_mode_index = si_tile_mode_index(iview->image, level, true); + ds->db_stencil_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); + } - if (iview->image->samples > 1) - ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->samples)); + ds->db_depth_size = S_028058_PITCH_TILE_MAX((level_info->nblk_x / 8) - 1) | + S_028058_HEIGHT_TILE_MAX((level_info->nblk_y / 8) - 1); + ds->db_depth_slice = S_02805C_SLICE_TILE_MAX((level_info->nblk_x * level_info->nblk_y) / 64 - 1); - if (iview->image->surface.flags & RADEON_SURF_SBUFFER) - ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_8); - else - ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_INVALID); - - if (device->instance->physicalDevice.rad_info.chip_class >= CIK) { - struct radeon_info *info = &device->instance->physicalDevice.rad_info; - unsigned tiling_index = iview->image->surface.tiling_index[level]; - unsigned stencil_index = iview->image->surface.stencil_tiling_index[level]; - unsigned macro_index = iview->image->surface.macro_tile_index; - unsigned tile_mode = info->si_tile_mode_array[tiling_index]; - unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; - unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; - - ds->db_depth_info |= - S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | - S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | - S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | - S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | - S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | - S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); - ds->db_z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); - ds->db_stencil_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); - } else { - unsigned tile_mode_index = si_tile_mode_index(iview->image, level, false); - ds->db_z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); - tile_mode_index = si_tile_mode_index(iview->image, level, true); - ds->db_stencil_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); - } - - if (iview->image->htile.size && !level) { - ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1) | - S_028040_ALLOW_EXPCLEAR(1); - - if (iview->image->surface.flags & RADEON_SURF_SBUFFER) { - /* Workaround: For a not yet understood reason, the - * combination of MSAA, fast stencil clear and stencil - * decompress messes with subsequent stencil buffer - * uses. Problem was reproduced on Verde, Bonaire, - * Tonga, and Carrizo. - * - * Disabling EXPCLEAR works around the problem. - * - * Check piglit's arb_texture_multisample-stencil-clear - * test if you want to try changing this. - */ - if (iview->image->samples <= 1) - ds->db_stencil_info |= S_028044_ALLOW_EXPCLEAR(1); - } else - /* Use all of the htile_buffer for depth if there's no stencil. */ - ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); + if (iview->image->surface.htile_size && !level) { + ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1); - va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + - iview->image->htile.offset; - ds->db_htile_data_base = va >> 8; - ds->db_htile_surface = S_028ABC_FULL_CACHE(1); - } else { - ds->db_htile_data_base = 0; - ds->db_htile_surface = 0; + if (!(iview->image->surface.flags & RADEON_SURF_SBUFFER)) + /* Use all of the htile_buffer for depth if there's no stencil. */ + ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); + + va = device->ws->buffer_get_va(iview->bo) + iview->image->offset + + iview->image->htile_offset; + ds->db_htile_data_base = va >> 8; + ds->db_htile_surface = S_028ABC_FULL_CACHE(1); + } } ds->db_z_read_base = ds->db_z_write_base = z_offs >> 8; ds->db_stencil_read_base = ds->db_stencil_write_base = s_offs >> 8; - - ds->db_depth_size = S_028058_PITCH_TILE_MAX((level_info->nblk_x / 8) - 1) | - S_028058_HEIGHT_TILE_MAX((level_info->nblk_y / 8) - 1); - ds->db_depth_slice = S_02805C_SLICE_TILE_MAX((level_info->nblk_x * level_info->nblk_y) / 64 - 1); } VkResult radv_CreateFramebuffer( @@ -1824,6 +2956,9 @@ VkResult radv_CreateFramebuffer( return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); framebuffer->attachment_count = pCreateInfo->attachmentCount; + framebuffer->width = pCreateInfo->width; + framebuffer->height = pCreateInfo->height; + framebuffer->layers = pCreateInfo->layers; for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { VkImageView _iview = pCreateInfo->pAttachments[i]; struct radv_image_view *iview = radv_image_view_from_handle(_iview); @@ -1833,12 +2968,11 @@ VkResult radv_CreateFramebuffer( } else if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { radv_initialise_ds_surface(device, &framebuffer->attachments[i].ds, iview); } + framebuffer->width = MIN2(framebuffer->width, iview->extent.width); + framebuffer->height = MIN2(framebuffer->height, iview->extent.height); + framebuffer->layers = MIN2(framebuffer->layers, radv_surface_layer_count(iview)); } - framebuffer->width = pCreateInfo->width; - framebuffer->height = pCreateInfo->height; - framebuffer->layers = pCreateInfo->layers; - *pFramebuffer = radv_framebuffer_to_handle(framebuffer); return VK_SUCCESS; } @@ -1972,7 +3106,7 @@ radv_init_sampler(struct radv_device *device, uint32_t max_aniso = pCreateInfo->anisotropyEnable && pCreateInfo->maxAnisotropy > 1.0 ? (uint32_t) pCreateInfo->maxAnisotropy : 0; uint32_t max_aniso_ratio = radv_tex_aniso_filter(max_aniso); - bool is_vi = (device->instance->physicalDevice.rad_info.chip_class >= VI); + bool is_vi = (device->physical_device->rad_info.chip_class >= VI); sampler->state[0] = (S_008F30_CLAMP_X(radv_tex_wrap(pCreateInfo->addressModeU)) | S_008F30_CLAMP_Y(radv_tex_wrap(pCreateInfo->addressModeV)) | @@ -1991,7 +3125,7 @@ radv_init_sampler(struct radv_device *device, S_008F38_XY_MAG_FILTER(radv_tex_filter(pCreateInfo->magFilter, max_aniso)) | S_008F38_XY_MIN_FILTER(radv_tex_filter(pCreateInfo->minFilter, max_aniso)) | S_008F38_MIP_FILTER(radv_tex_mipfilter(pCreateInfo->mipmapMode)) | - S_008F38_MIP_POINT_PRECLAMP(1) | + S_008F38_MIP_POINT_PRECLAMP(0) | S_008F38_DISABLE_LSB_CEIL(1) | S_008F38_FILTER_PREC_FIX(1) | S_008F38_ANISO_OVERRIDE(is_vi)); @@ -2034,7 +3168,6 @@ void radv_DestroySampler( vk_free2(&device->alloc, pAllocator, sampler); } - /* vk_icd.h does not declare this function, so we declare it here to * suppress Wmissing-prototypes. */ @@ -2078,3 +3211,34 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t *pSupportedVersion) *pSupportedVersion = MIN2(*pSupportedVersion, 3u); return VK_SUCCESS; } + +VkResult radv_GetMemoryFdKHX(VkDevice _device, + VkDeviceMemory _memory, + VkExternalMemoryHandleTypeFlagsKHX handleType, + int *pFD) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_device_memory, memory, _memory); + + /* We support only one handle type. */ + assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHX); + + bool ret = radv_get_memory_fd(device, memory, pFD); + if (ret == false) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + return VK_SUCCESS; +} + +VkResult radv_GetMemoryFdPropertiesKHX(VkDevice _device, + VkExternalMemoryHandleTypeFlagBitsKHX handleType, + int fd, + VkMemoryFdPropertiesKHX *pMemoryFdProperties) +{ + /* The valid usage section for this function says: + * + * "handleType must not be one of the handle types defined as opaque." + * + * Since we only handle opaque handles for now, there are no FD properties. + */ + return VK_ERROR_INVALID_EXTERNAL_HANDLE_KHX; +}