radv: Set driver version to mesa version
[mesa.git] / src / amd / vulkan / radv_device.c
index da67b65ef43260d2ff70e9e7149f6a9c108b008f..d1fd58d77b9081bfbb9e5f8173229b99ce316b73 100644 (file)
  * IN THE SOFTWARE.
  */
 
-#include <dlfcn.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
-#include <sys/stat.h>
 #include "radv_private.h"
 #include "radv_cs.h"
+#include "util/disk_cache.h"
 #include "util/strtod.h"
-
+#include "util/vk_util.h"
 #include <xf86drm.h>
 #include <amdgpu.h>
 #include <amdgpu_drm.h>
 #include "vk_format.h"
 #include "sid.h"
 #include "util/debug.h"
-struct radv_dispatch_table dtable;
-
-static int
-radv_get_function_timestamp(void *ptr, uint32_t* timestamp)
-{
-       Dl_info info;
-       struct stat st;
-       if (!dladdr(ptr, &info) || !info.dli_fname) {
-               return -1;
-       }
-       if (stat(info.dli_fname, &st)) {
-               return -1;
-       }
-       *timestamp = st.st_mtim.tv_sec;
-       return 0;
-}
 
 static int
 radv_device_get_cache_uuid(enum radeon_family family, void *uuid)
@@ -67,8 +50,8 @@ radv_device_get_cache_uuid(enum radeon_family family, void *uuid)
        uint32_t mesa_timestamp, llvm_timestamp;
        uint16_t f = family;
        memset(uuid, 0, VK_UUID_SIZE);
-       if (radv_get_function_timestamp(radv_device_get_cache_uuid, &mesa_timestamp) ||
-           radv_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, &llvm_timestamp))
+       if (!disk_cache_get_function_timestamp(radv_device_get_cache_uuid, &mesa_timestamp) ||
+           !disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, &llvm_timestamp))
                return -1;
 
        memcpy(uuid, &mesa_timestamp, 4);
@@ -124,6 +107,14 @@ static const VkExtensionProperties common_device_extensions[] = {
                .extensionName = VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
                .specVersion = 1,
        },
+       {
+               .extensionName = VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME,
+               .specVersion = 1,
+       },
+       {
+               .extensionName = VK_NV_DEDICATED_ALLOCATION_EXTENSION_NAME,
+               .specVersion = 1,
+       },
 };
 
 static VkResult
@@ -216,11 +207,13 @@ radv_physical_device_init(struct radv_physical_device *device,
        assert(strlen(path) < ARRAY_SIZE(device->path));
        strncpy(device->path, path, ARRAY_SIZE(device->path));
 
-       device->ws = radv_amdgpu_winsys_create(fd);
+       device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags);
        if (!device->ws) {
                result = VK_ERROR_INCOMPATIBLE_DRIVER;
                goto fail;
        }
+
+       device->local_fd = fd;
        device->ws->query_info(device->ws, &device->rad_info);
        result = radv_init_wsi(device);
        if (result != VK_SUCCESS) {
@@ -245,7 +238,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 
        fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n");
        device->name = device->rad_info.name;
-       close(fd);
+
        return VK_SUCCESS;
 
 fail:
@@ -259,6 +252,7 @@ radv_physical_device_finish(struct radv_physical_device *device)
        radv_extensions_finish(device->instance, &device->extensions);
        radv_finish_wsi(device);
        device->ws->destroy(device->ws);
+       close(device->local_fd);
 }
 
 
@@ -290,7 +284,7 @@ static const VkAllocationCallbacks default_alloc = {
 };
 
 static const struct debug_control radv_debug_options[] = {
-       {"fastclears", RADV_DEBUG_FAST_CLEARS},
+       {"nofastclears", RADV_DEBUG_NO_FAST_CLEARS},
        {"nodcc", RADV_DEBUG_NO_DCC},
        {"shaders", RADV_DEBUG_DUMP_SHADERS},
        {"nocache", RADV_DEBUG_NO_CACHE},
@@ -298,6 +292,8 @@ static const struct debug_control radv_debug_options[] = {
        {"nohiz", RADV_DEBUG_NO_HIZ},
        {"nocompute", RADV_DEBUG_NO_COMPUTE_QUEUE},
        {"unsafemath", RADV_DEBUG_UNSAFE_MATH},
+       {"allbos", RADV_DEBUG_ALL_BOS},
+       {"noibs", RADV_DEBUG_NO_IBS},
        {NULL, 0}
 };
 
@@ -369,6 +365,9 @@ void radv_DestroyInstance(
 {
        RADV_FROM_HANDLE(radv_instance, instance, _instance);
 
+       if (!instance)
+               return;
+
        for (int i = 0; i < instance->physicalDeviceCount; ++i) {
                radv_physical_device_finish(instance->physicalDevices + i);
        }
@@ -459,14 +458,13 @@ void radv_GetPhysicalDeviceFeatures(
                .shaderSampledImageArrayDynamicIndexing   = true,
                .shaderStorageBufferArrayDynamicIndexing  = true,
                .shaderStorageImageArrayDynamicIndexing   = true,
-               .shaderStorageImageReadWithoutFormat      = false,
-               .shaderStorageImageWriteWithoutFormat     = false,
+               .shaderStorageImageReadWithoutFormat      = true,
+               .shaderStorageImageWriteWithoutFormat     = true,
                .shaderClipDistance                       = true,
                .shaderCullDistance                       = true,
-               .shaderFloat64                            = false,
+               .shaderFloat64                            = true,
                .shaderInt64                              = false,
                .shaderInt16                              = false,
-               .alphaToOne                               = true,
                .variableMultisampleRate                  = false,
                .inheritedQueries                         = false,
        };
@@ -479,6 +477,28 @@ void radv_GetPhysicalDeviceFeatures2KHR(
        return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 }
 
+/**
+ * Build the value reported as VkPhysicalDeviceProperties::driverVersion
+ * from the build-time VERSION string.
+ *
+ * VERSION is parsed as "major.minor.patch"; a component that is not
+ * present counts as 0. When the string contains "devel" the result is
+ * stepped back by one patch level: a patch of 0 becomes 99 and the minor
+ * is decremented, and a minor of 0 likewise becomes 99 with the major
+ * decremented.
+ *
+ * Returns the components packed with VK_MAKE_VERSION(major, minor, patch).
+ */
+static uint32_t radv_get_driver_version(void)
+{
+       const char *minor_string = strchr(VERSION, '.');
+       /* The patch level follows the second '.' of the version string. */
+       const char *patch_string = minor_string ? strchr(minor_string + 1, '.') : NULL;
+       int major = atoi(VERSION);
+       int minor = minor_string ? atoi(minor_string + 1) : 0;
+       int patch = patch_string ? atoi(patch_string + 1) : 0;
+       if (strstr(VERSION, "devel")) {
+               /* Development builds report one step below the parsed version. */
+               if (patch == 0) {
+                       patch = 99;
+                       if (minor == 0) {
+                               minor = 99;
+                               --major;
+                       } else
+                               --minor;
+               } else
+                       --patch;
+       }
+       uint32_t version = VK_MAKE_VERSION(major, minor, patch);
+       return version;
+}
+
 void radv_GetPhysicalDeviceProperties(
        VkPhysicalDevice                            physicalDevice,
        VkPhysicalDeviceProperties*                 pProperties)
@@ -599,8 +619,8 @@ void radv_GetPhysicalDeviceProperties(
        };
 
        *pProperties = (VkPhysicalDeviceProperties) {
-               .apiVersion = VK_MAKE_VERSION(1, 0, 5),
-               .driverVersion = 1,
+               .apiVersion = VK_MAKE_VERSION(1, 0, 42),
+               .driverVersion = radv_get_driver_version(),
                .vendorID = 0x1002,
                .deviceID = pdevice->rad_info.pci_id,
                .deviceType = VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU,
@@ -619,12 +639,11 @@ void radv_GetPhysicalDeviceProperties2KHR(
        return radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
 }
 
-void radv_GetPhysicalDeviceQueueFamilyProperties(
-       VkPhysicalDevice                            physicalDevice,
+static void radv_get_physical_device_queue_family_properties(
+       struct radv_physical_device*                pdevice,
        uint32_t*                                   pCount,
-       VkQueueFamilyProperties*                    pQueueFamilyProperties)
+       VkQueueFamilyProperties**                    pQueueFamilyProperties)
 {
-       RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
        int num_queue_families = 1;
        int idx;
        if (pdevice->rad_info.compute_rings > 0 &&
@@ -642,7 +661,7 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
 
        idx = 0;
        if (*pCount >= 1) {
-               pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
+               *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
                        .queueFlags = VK_QUEUE_GRAPHICS_BIT |
                        VK_QUEUE_COMPUTE_BIT |
                        VK_QUEUE_TRANSFER_BIT,
@@ -657,7 +676,7 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
            pdevice->rad_info.chip_class >= CIK &&
            !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
                if (*pCount > idx) {
-                       pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
+                       *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
                                .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT,
                                .queueCount = pdevice->rad_info.compute_rings,
                                .timestampValidBits = 64,
@@ -669,14 +688,42 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
        *pCount = idx;
 }
 
+/**
+ * Vulkan entry point that reports the queue families of a physical device.
+ *
+ * Resolves the handle and forwards to
+ * radv_get_physical_device_queue_family_properties(), which takes one
+ * output pointer per queue-family slot.
+ *
+ * physicalDevice:          handle of the device to query.
+ * pCount:                  in/out count, passed through to the helper.
+ * pQueueFamilyProperties:  output array, or NULL; NULL is forwarded as-is
+ *                          (presumably the helper then only writes *pCount;
+ *                          confirm against the helper).
+ */
+void radv_GetPhysicalDeviceQueueFamilyProperties(
+       VkPhysicalDevice                            physicalDevice,
+       uint32_t*                                   pCount,
+       VkQueueFamilyProperties*                    pQueueFamilyProperties)
+{
+       RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+       if (!pQueueFamilyProperties) {
+               /* A void function must not return an expression: call, then return. */
+               radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL);
+               return;
+       }
+       /* One pointer per slot of the caller's array; at most three slots are used. */
+       VkQueueFamilyProperties *properties[] = {
+               pQueueFamilyProperties + 0,
+               pQueueFamilyProperties + 1,
+               pQueueFamilyProperties + 2,
+       };
+       radv_get_physical_device_queue_family_properties(pdevice, pCount, properties);
+       assert(*pCount <= 3);
+}
+
+/**
+ * VK_KHR_get_physical_device_properties2 variant of the queue-family query.
+ *
+ * Each VkQueueFamilyProperties2KHR element embeds a VkQueueFamilyProperties
+ * as .queueFamilyProperties, so the addresses of those members are handed
+ * to radv_get_physical_device_queue_family_properties().
+ *
+ * A NULL pQueueFamilyProperties is forwarded to the helper as NULL
+ * (presumably a count-only query; confirm against the helper).
+ */
 void radv_GetPhysicalDeviceQueueFamilyProperties2KHR(
        VkPhysicalDevice                            physicalDevice,
        uint32_t*                                   pCount,
        VkQueueFamilyProperties2KHR                *pQueueFamilyProperties)
 {
-       return radv_GetPhysicalDeviceQueueFamilyProperties(physicalDevice,
-                                                          pCount,
-                                                          &pQueueFamilyProperties->queueFamilyProperties);
+       RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+       if (!pQueueFamilyProperties) {
+               /* A void function must not return an expression: call, then return. */
+               radv_get_physical_device_queue_family_properties(pdevice, pCount, NULL);
+               return;
+       }
+       /* One pointer per slot, aimed at the embedded core struct of each element. */
+       VkQueueFamilyProperties *properties[] = {
+               &pQueueFamilyProperties[0].queueFamilyProperties,
+               &pQueueFamilyProperties[1].queueFamilyProperties,
+               &pQueueFamilyProperties[2].queueFamilyProperties,
+       };
+       radv_get_physical_device_queue_family_properties(pdevice, pCount, properties);
+       assert(*pCount <= 3);
 }
 
 void radv_GetPhysicalDeviceMemoryProperties(
@@ -758,8 +805,10 @@ radv_queue_finish(struct radv_queue *queue)
        if (queue->hw_ctx)
                queue->device->ws->ctx_destroy(queue->hw_ctx);
 
-       if (queue->preamble_cs)
-               queue->device->ws->cs_destroy(queue->preamble_cs);
+       if (queue->initial_preamble_cs)
+               queue->device->ws->cs_destroy(queue->initial_preamble_cs);
+       if (queue->continue_preamble_cs)
+               queue->device->ws->cs_destroy(queue->continue_preamble_cs);
        if (queue->descriptor_bo)
                queue->device->ws->buffer_destroy(queue->descriptor_bo);
        if (queue->scratch_bo)
@@ -905,6 +954,21 @@ VkResult radv_CreateDevice(
                        break;
                }
                device->ws->cs_finalize(device->empty_cs[family]);
+
+               device->flush_cs[family] = device->ws->cs_create(device->ws, family);
+               switch (family) {
+               case RADV_QUEUE_GENERAL:
+               case RADV_QUEUE_COMPUTE:
+                       si_cs_emit_cache_flush(device->flush_cs[family],
+                                              device->physical_device->rad_info.chip_class,
+                                              family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
+                                              RADV_CMD_FLAG_INV_ICACHE |
+                                              RADV_CMD_FLAG_INV_SMEM_L1 |
+                                              RADV_CMD_FLAG_INV_VMEM_L1 |
+                                              RADV_CMD_FLAG_INV_GLOBAL_L2);
+                       break;
+               }
+               device->ws->cs_finalize(device->flush_cs[family]);
        }
 
        if (getenv("RADV_TRACE_FILE")) {
@@ -918,6 +982,9 @@ VkResult radv_CreateDevice(
                        goto fail;
        }
 
+       if (device->physical_device->rad_info.chip_class >= CIK)
+               cik_create_gfx_config(device);
+
        *pDevice = radv_device_to_handle(device);
        return VK_SUCCESS;
 
@@ -925,6 +992,9 @@ fail:
        if (device->trace_bo)
                device->ws->buffer_destroy(device->trace_bo);
 
+       if (device->gfx_init)
+               device->ws->buffer_destroy(device->gfx_init);
+
        for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
                for (unsigned q = 0; q < device->queue_count[i]; q++)
                        radv_queue_finish(&device->queues[i][q]);
@@ -942,14 +1012,24 @@ void radv_DestroyDevice(
 {
        RADV_FROM_HANDLE(radv_device, device, _device);
 
+       if (!device)
+               return;
+
        if (device->trace_bo)
                device->ws->buffer_destroy(device->trace_bo);
 
+       if (device->gfx_init)
+               device->ws->buffer_destroy(device->gfx_init);
+
        for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
                for (unsigned q = 0; q < device->queue_count[i]; q++)
                        radv_queue_finish(&device->queues[i][q]);
                if (device->queue_count[i])
                        vk_free(&device->alloc, device->queues[i]);
+               if (device->empty_cs[i])
+                       device->ws->cs_destroy(device->empty_cs[i]);
+               if (device->flush_cs[i])
+                       device->ws->cs_destroy(device->flush_cs[i]);
        }
        radv_device_finish_meta(device);
 
@@ -1147,25 +1227,25 @@ radv_get_preamble_cs(struct radv_queue *queue,
                      uint32_t compute_scratch_size,
                     uint32_t esgs_ring_size,
                     uint32_t gsvs_ring_size,
-                     struct radeon_winsys_cs **preamble_cs)
+                     struct radeon_winsys_cs **initial_preamble_cs,
+                     struct radeon_winsys_cs **continue_preamble_cs)
 {
        struct radeon_winsys_bo *scratch_bo = NULL;
        struct radeon_winsys_bo *descriptor_bo = NULL;
        struct radeon_winsys_bo *compute_scratch_bo = NULL;
        struct radeon_winsys_bo *esgs_ring_bo = NULL;
        struct radeon_winsys_bo *gsvs_ring_bo = NULL;
-       struct radeon_winsys_cs *cs = NULL;
-
-       if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) {
-               *preamble_cs = NULL;
-               return VK_SUCCESS;
-       }
+       struct radeon_winsys_cs *dest_cs[2] = {0};
 
        if (scratch_size <= queue->scratch_size &&
            compute_scratch_size <= queue->compute_scratch_size &&
            esgs_ring_size <= queue->esgs_ring_size &&
-           gsvs_ring_size <= queue->gsvs_ring_size) {
-               *preamble_cs = queue->preamble_cs;
+           gsvs_ring_size <= queue->gsvs_ring_size &&
+           queue->initial_preamble_cs) {
+               *initial_preamble_cs = queue->initial_preamble_cs;
+               *continue_preamble_cs = queue->continue_preamble_cs;
+               if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
+                       *continue_preamble_cs = NULL;
                return VK_SUCCESS;
        }
 
@@ -1237,94 +1317,113 @@ radv_get_preamble_cs(struct radv_queue *queue,
        } else
                descriptor_bo = queue->descriptor_bo;
 
-       cs = queue->device->ws->cs_create(queue->device->ws,
-                                         queue->queue_family_index ? RING_COMPUTE : RING_GFX);
-       if (!cs)
-               goto fail;
+       for(int i = 0; i < 2; ++i) {
+               struct radeon_winsys_cs *cs = NULL;
+               cs = queue->device->ws->cs_create(queue->device->ws,
+                                                 queue->queue_family_index ? RING_COMPUTE : RING_GFX);
+               if (!cs)
+                       goto fail;
 
+               dest_cs[i] = cs;
 
-       if (scratch_bo)
-               queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
+               if (scratch_bo)
+                       queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
 
-       if (esgs_ring_bo)
-               queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
+               if (esgs_ring_bo)
+                       queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
 
-       if (gsvs_ring_bo)
-               queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
+               if (gsvs_ring_bo)
+                       queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
 
-       if (descriptor_bo)
-               queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
+               if (descriptor_bo)
+                       queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
 
-       if (descriptor_bo != queue->descriptor_bo) {
-               uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
+               if (descriptor_bo != queue->descriptor_bo) {
+                       uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
 
-               if (scratch_bo) {
-                       uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
-                       uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-                               S_008F04_SWIZZLE_ENABLE(1);
-                       map[0] = scratch_va;
-                       map[1] = rsrc1;
+                       if (scratch_bo) {
+                               uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
+                               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                                                S_008F04_SWIZZLE_ENABLE(1);
+                               map[0] = scratch_va;
+                               map[1] = rsrc1;
+                       }
+
+                       if (esgs_ring_bo || gsvs_ring_bo)
+                               fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
+
+                       queue->device->ws->buffer_unmap(descriptor_bo);
                }
 
-               if (esgs_ring_bo || gsvs_ring_bo)
-                       fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
+               if (esgs_ring_bo || gsvs_ring_bo) {
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+                       radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+                       radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+
+                       if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+                               radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+                               radeon_emit(cs, esgs_ring_size >> 8);
+                               radeon_emit(cs, gsvs_ring_size >> 8);
+                       } else {
+                               radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+                               radeon_emit(cs, esgs_ring_size >> 8);
+                               radeon_emit(cs, gsvs_ring_size >> 8);
+                       }
+               }
 
-               queue->device->ws->buffer_unmap(descriptor_bo);
-       }
+               if (descriptor_bo) {
+                       uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+                                          R_00B130_SPI_SHADER_USER_DATA_VS_0,
+                                          R_00B230_SPI_SHADER_USER_DATA_GS_0,
+                                          R_00B330_SPI_SHADER_USER_DATA_ES_0,
+                                          R_00B430_SPI_SHADER_USER_DATA_HS_0,
+                                          R_00B530_SPI_SHADER_USER_DATA_LS_0};
 
-       if (esgs_ring_bo || gsvs_ring_bo) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+                       uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo);
 
-               if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-                       radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
-                       radeon_emit(cs, esgs_ring_size >> 8);
-                       radeon_emit(cs, gsvs_ring_size >> 8);
-               } else {
-                       radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
-                       radeon_emit(cs, esgs_ring_size >> 8);
-                       radeon_emit(cs, gsvs_ring_size >> 8);
+                       for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+                               radeon_set_sh_reg_seq(cs, regs[i], 2);
+                               radeon_emit(cs, va);
+                               radeon_emit(cs, va >> 32);
+                       }
                }
-       }
 
-       if (descriptor_bo) {
-               uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-                                  R_00B130_SPI_SHADER_USER_DATA_VS_0,
-                                  R_00B230_SPI_SHADER_USER_DATA_GS_0,
-                                  R_00B330_SPI_SHADER_USER_DATA_ES_0,
-                                  R_00B430_SPI_SHADER_USER_DATA_HS_0,
-                                  R_00B530_SPI_SHADER_USER_DATA_LS_0};
+               if (compute_scratch_bo) {
+                       uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo);
+                       uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                                        S_008F04_SWIZZLE_ENABLE(1);
 
-               uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo);
+                       queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8);
 
-               for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-                       radeon_set_sh_reg_seq(cs, regs[i], 2);
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, va >> 32);
+                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+                       radeon_emit(cs, scratch_va);
+                       radeon_emit(cs, rsrc1);
                }
-       }
 
-       if (compute_scratch_bo) {
-               uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo);
-               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-                                S_008F04_SWIZZLE_ENABLE(1);
-
-               queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8);
+               if (!i) {
+                       si_cs_emit_cache_flush(cs,
+                                              queue->device->physical_device->rad_info.chip_class,
+                                              queue->queue_family_index == RING_COMPUTE &&
+                                                queue->device->physical_device->rad_info.chip_class >= CIK,
+                                              RADV_CMD_FLAG_INV_ICACHE |
+                                              RADV_CMD_FLAG_INV_SMEM_L1 |
+                                              RADV_CMD_FLAG_INV_VMEM_L1 |
+                                              RADV_CMD_FLAG_INV_GLOBAL_L2);
+               }
 
-               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
-               radeon_emit(cs, scratch_va);
-               radeon_emit(cs, rsrc1);
+               if (!queue->device->ws->cs_finalize(cs))
+                       goto fail;
        }
 
-       if (!queue->device->ws->cs_finalize(cs))
-               goto fail;
+       if (queue->initial_preamble_cs)
+                       queue->device->ws->cs_destroy(queue->initial_preamble_cs);
 
-       if (queue->preamble_cs)
-               queue->device->ws->cs_destroy(queue->preamble_cs);
+       if (queue->continue_preamble_cs)
+                       queue->device->ws->cs_destroy(queue->continue_preamble_cs);
 
-       queue->preamble_cs = cs;
+       queue->initial_preamble_cs = dest_cs[0];
+       queue->continue_preamble_cs = dest_cs[1];
 
        if (scratch_bo != queue->scratch_bo) {
                if (queue->scratch_bo)
@@ -1361,11 +1460,15 @@ radv_get_preamble_cs(struct radv_queue *queue,
                queue->descriptor_bo = descriptor_bo;
        }
 
-       *preamble_cs = cs;
+       *initial_preamble_cs = queue->initial_preamble_cs;
+       *continue_preamble_cs = queue->continue_preamble_cs;
+       if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
+                       *continue_preamble_cs = NULL;
        return VK_SUCCESS;
 fail:
-       if (cs)
-               queue->device->ws->cs_destroy(cs);
+       for (int i = 0; i < ARRAY_SIZE(dest_cs); ++i)
+               if (dest_cs[i])
+                       queue->device->ws->cs_destroy(dest_cs[i]);
        if (descriptor_bo && descriptor_bo != queue->descriptor_bo)
                queue->device->ws->buffer_destroy(descriptor_bo);
        if (scratch_bo && scratch_bo != queue->scratch_bo)
@@ -1394,8 +1497,9 @@ VkResult radv_QueueSubmit(
        uint32_t scratch_size = 0;
        uint32_t compute_scratch_size = 0;
        uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
-       struct radeon_winsys_cs *preamble_cs = NULL;
+       struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
        VkResult result;
+       bool fence_emitted = false;
 
        /* Do this first so failing to allocate scratch buffers can't result in
         * partially executed submissions. */
@@ -1412,42 +1516,64 @@ VkResult radv_QueueSubmit(
                }
        }
 
-       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
+       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
+                                     esgs_ring_size, gsvs_ring_size,
+                                     &initial_preamble_cs, &continue_preamble_cs);
        if (result != VK_SUCCESS)
                return result;
 
        for (uint32_t i = 0; i < submitCount; i++) {
                struct radeon_winsys_cs **cs_array;
-               bool can_patch = true;
+               bool has_flush = !submitCount;
+               bool can_patch = !has_flush;
                uint32_t advance;
 
-               if (!pSubmits[i].commandBufferCount)
+               if (!pSubmits[i].commandBufferCount) {
+                       if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) {
+                               ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
+                                                                  &queue->device->empty_cs[queue->queue_family_index],
+                                                                  1, NULL, NULL,
+                                                                  (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
+                                                                  pSubmits[i].waitSemaphoreCount,
+                                                                  (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
+                                                                  pSubmits[i].signalSemaphoreCount,
+                                                                  false, base_fence);
+                               if (ret) {
+                                       radv_loge("failed to submit CS %d\n", i);
+                                       abort();
+                               }
+                               fence_emitted = true;
+                       }
                        continue;
+               }
 
                cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
-                                               pSubmits[i].commandBufferCount);
+                                               (pSubmits[i].commandBufferCount + has_flush));
+
+               if(has_flush)
+                       cs_array[0] = queue->device->flush_cs[queue->queue_family_index];
 
                for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
                        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
                                         pSubmits[i].pCommandBuffers[j]);
                        assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-                       cs_array[j] = cmd_buffer->cs;
+                       cs_array[j + has_flush] = cmd_buffer->cs;
                        if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
                                can_patch = false;
                }
 
-               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
+               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + has_flush; j += advance) {
                        advance = MIN2(max_cs_submission,
-                                      pSubmits[i].commandBufferCount - j);
+                                      pSubmits[i].commandBufferCount + has_flush - j);
                        bool b = j == 0;
-                       bool e = j + advance == pSubmits[i].commandBufferCount;
+                       bool e = j + advance == pSubmits[i].commandBufferCount + has_flush;
 
                        if (queue->device->trace_bo)
                                *queue->device->trace_id_ptr = 0;
 
                        ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
-                                                       advance, preamble_cs,
+                                                       advance, initial_preamble_cs, continue_preamble_cs,
                                                        (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
                                                        b ? pSubmits[i].waitSemaphoreCount : 0,
                                                        (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
@@ -1458,6 +1584,7 @@ VkResult radv_QueueSubmit(
                                radv_loge("failed to submit CS %d\n", i);
                                abort();
                        }
+                       fence_emitted = true;
                        if (queue->device->trace_bo) {
                                bool success = queue->device->ws->ctx_wait_idle(
                                                        queue->hw_ctx,
@@ -1475,10 +1602,10 @@ VkResult radv_QueueSubmit(
        }
 
        if (fence) {
-               if (!submitCount)
+               if (!fence_emitted)
                        ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
                                                           &queue->device->empty_cs[queue->queue_family_index],
-                                                          1, NULL, NULL, 0, NULL, 0,
+                                                          1, NULL, NULL, NULL, 0, NULL, 0,
                                                           false, base_fence);
 
                fence->submitted = true;
@@ -1541,6 +1668,21 @@ PFN_vkVoidFunction radv_GetDeviceProcAddr(
        return radv_lookup_entrypoint(pName);
 }
 
+bool radv_get_memory_fd(struct radv_device *device,
+                       struct radv_device_memory *memory,
+                       int *pFD)
+{
+       struct radeon_bo_metadata metadata;
+       /* Export memory->bo as a file descriptor through the winsys. */
+       if (memory->image) { /* image attached to this memory: store its metadata on the BO first */
+               radv_init_metadata(device, memory->image, &metadata);
+               device->ws->buffer_set_metadata(memory->bo, &metadata);
+       }
+       /* Result of buffer_get_fd() is returned unchanged; pFD is passed through to it. */
+       return device->ws->buffer_get_fd(device->ws, memory->bo,
+                                        pFD);
+}
+
 VkResult radv_AllocateMemory(
        VkDevice                                    _device,
        const VkMemoryAllocateInfo*                 pAllocateInfo,
@@ -1552,6 +1694,7 @@ VkResult radv_AllocateMemory(
        VkResult result;
        enum radeon_bo_domain domain;
        uint32_t flags = 0;
+       const VkDedicatedAllocationMemoryAllocateInfoNV *dedicate_info = NULL;
        assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
 
        if (pAllocateInfo->allocationSize == 0) {
@@ -1560,11 +1703,29 @@ VkResult radv_AllocateMemory(
                return VK_SUCCESS;
        }
 
+       vk_foreach_struct(ext, pAllocateInfo->pNext) {
+               switch (ext->sType) {
+               case VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_MEMORY_ALLOCATE_INFO_NV:
+                       dedicate_info = (const VkDedicatedAllocationMemoryAllocateInfoNV *)ext;
+                       break;
+               default:
+                       break;
+               }
+       }
+
        mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
        if (mem == NULL)
                return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
+       if (dedicate_info) {
+               mem->image = radv_image_from_handle(dedicate_info->image);
+               mem->buffer = radv_buffer_from_handle(dedicate_info->buffer);
+       } else {
+               mem->image = NULL;
+               mem->buffer = NULL;
+       }
+
        uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
        if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
            pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_CACHED)
@@ -1580,7 +1741,7 @@ VkResult radv_AllocateMemory(
        if (pAllocateInfo->memoryTypeIndex == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
                flags |= RADEON_FLAG_GTT_WC;
 
-       mem->bo = device->ws->buffer_create(device->ws, alloc_size, 32768,
+       mem->bo = device->ws->buffer_create(device->ws, alloc_size, 65536,
                                               domain, flags);
 
        if (!mem->bo) {
@@ -1895,7 +2056,7 @@ VkResult radv_CreateSemaphore(
        if (!sem)
                return VK_ERROR_OUT_OF_HOST_MEMORY;
 
-       *pSemaphore = (VkSemaphore)sem;
+       *pSemaphore = radeon_winsys_sem_to_handle(sem);
        return VK_SUCCESS;
 }
 
@@ -1905,11 +2066,10 @@ void radv_DestroySemaphore(
        const VkAllocationCallbacks*                pAllocator)
 {
        RADV_FROM_HANDLE(radv_device, device, _device);
-       struct radeon_winsys_sem *sem;
+       RADV_FROM_HANDLE(radeon_winsys_sem, sem, _semaphore);
        if (!_semaphore)
                return;
 
-       sem = (struct radeon_winsys_sem *)_semaphore;
        device->ws->destroy_sem(sem);
 }
 
@@ -2036,6 +2196,11 @@ si_tile_mode_index(const struct radv_image *image, unsigned level, bool stencil)
                return image->surface.tiling_index[level];
 }
 
+static uint32_t radv_surface_layer_count(struct radv_image_view *iview)
+{ /* Slice count of a view: extent.depth for 3D views, layer_count for every other type. */
+       return iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count;
+}
+
 static void
 radv_initialise_color_surface(struct radv_device *device,
                              struct radv_color_buffer_info *cb,
@@ -2067,7 +2232,7 @@ radv_initialise_color_surface(struct radv_device *device,
        va += iview->image->dcc_offset;
        cb->cb_dcc_base = va >> 8;
 
-       uint32_t max_slice = iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count;
+       uint32_t max_slice = radv_surface_layer_count(iview);
        cb->cb_color_view = S_028C6C_SLICE_START(iview->base_layer) |
                S_028C6C_SLICE_MAX(iview->base_layer + max_slice - 1);
 
@@ -2153,7 +2318,7 @@ radv_initialise_color_surface(struct radv_device *device,
                        cb->cb_color_info |= S_028C70_COMPRESSION(1);
 
        if (iview->image->cmask.size &&
-           (device->debug_flags & RADV_DEBUG_FAST_CLEARS))
+           !(device->debug_flags & RADV_DEBUG_NO_FAST_CLEARS))
                cb->cb_color_info |= S_028C70_FAST_CLEAR(1);
 
        if (iview->image->surface.dcc_size && level_info->dcc_enabled)
@@ -2221,7 +2386,7 @@ radv_initialise_ds_surface(struct radv_device *device,
        z_offs += iview->image->surface.level[level].offset;
        s_offs += iview->image->surface.stencil_level[level].offset;
 
-       uint32_t max_slice = iview->type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth : iview->layer_count;
+       uint32_t max_slice = radv_surface_layer_count(iview);
        ds->db_depth_view = S_028008_SLICE_START(iview->base_layer) |
                S_028008_SLICE_MAX(iview->base_layer + max_slice - 1);
        ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
@@ -2260,7 +2425,7 @@ radv_initialise_ds_surface(struct radv_device *device,
                ds->db_stencil_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
        }
 
-       if (iview->image->htile.size && !level) {
+       if (iview->image->surface.htile_size && !level) {
                ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
                        S_028040_ALLOW_EXPCLEAR(1);
 
@@ -2283,7 +2448,7 @@ radv_initialise_ds_surface(struct radv_device *device,
                        ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
 
                va = device->ws->buffer_get_va(iview->bo) + iview->image->offset +
-                    iview->image->htile.offset;
+                    iview->image->htile_offset;
                ds->db_htile_data_base = va >> 8;
                ds->db_htile_surface = S_028ABC_FULL_CACHE(1);
        } else {
@@ -2318,6 +2483,9 @@ VkResult radv_CreateFramebuffer(
                return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
        framebuffer->attachment_count = pCreateInfo->attachmentCount;
+       framebuffer->width = pCreateInfo->width;
+       framebuffer->height = pCreateInfo->height;
+       framebuffer->layers = pCreateInfo->layers;
        for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
                VkImageView _iview = pCreateInfo->pAttachments[i];
                struct radv_image_view *iview = radv_image_view_from_handle(_iview);
@@ -2327,12 +2495,11 @@ VkResult radv_CreateFramebuffer(
                } else if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
                        radv_initialise_ds_surface(device, &framebuffer->attachments[i].ds, iview);
                }
+               framebuffer->width = MIN2(framebuffer->width, iview->extent.width);
+               framebuffer->height = MIN2(framebuffer->height, iview->extent.height);
+               framebuffer->layers = MIN2(framebuffer->layers, radv_surface_layer_count(iview));
        }
 
-       framebuffer->width = pCreateInfo->width;
-       framebuffer->height = pCreateInfo->height;
-       framebuffer->layers = pCreateInfo->layers;
-
        *pFramebuffer = radv_framebuffer_to_handle(framebuffer);
        return VK_SUCCESS;
 }
@@ -2485,7 +2652,7 @@ radv_init_sampler(struct radv_device *device,
                             S_008F38_XY_MAG_FILTER(radv_tex_filter(pCreateInfo->magFilter, max_aniso)) |
                             S_008F38_XY_MIN_FILTER(radv_tex_filter(pCreateInfo->minFilter, max_aniso)) |
                             S_008F38_MIP_FILTER(radv_tex_mipfilter(pCreateInfo->mipmapMode)) |
-                            S_008F38_MIP_POINT_PRECLAMP(1) |
+                            S_008F38_MIP_POINT_PRECLAMP(0) |
                             S_008F38_DISABLE_LSB_CEIL(1) |
                             S_008F38_FILTER_PREC_FIX(1) |
                             S_008F38_ANISO_OVERRIDE(is_vi));