automake: Link all libGL.so variants with -Bsymbolic.
[mesa.git] / src / amd / vulkan / radv_device.c
index adcc63a81727d367c60c3d1f34244f01aabea4ad..2d89e8635e73ff99322b8052dfd4a273bab29702 100644 (file)
@@ -91,10 +91,22 @@ static const VkExtensionProperties instance_extensions[] = {
 };
 
 static const VkExtensionProperties common_device_extensions[] = {
+       {
+               .extensionName = VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME,
+               .specVersion = 1,
+       },
+       {
+               .extensionName = VK_KHR_INCREMENTAL_PRESENT_EXTENSION_NAME,
+               .specVersion = 1,
+       },
        {
                .extensionName = VK_KHR_MAINTENANCE1_EXTENSION_NAME,
                .specVersion = 1,
        },
+       {
+               .extensionName = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+               .specVersion = 1,
+       },
        {
                .extensionName = VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME,
                .specVersion = 1,
@@ -389,7 +401,7 @@ radv_enumerate_devices(struct radv_instance *instance)
 
        instance->physicalDeviceCount = 0;
 
-       max_devices = drmGetDevices2(0, devices, sizeof(devices));
+       max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
        if (max_devices < 1)
                return VK_ERROR_INCOMPATIBLE_DRIVER;
 
@@ -405,9 +417,11 @@ radv_enumerate_devices(struct radv_instance *instance)
                        if (result == VK_SUCCESS)
                                ++instance->physicalDeviceCount;
                        else if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
-                               return result;
+                               break;
                }
        }
+       drmFreeDevices(devices, max_devices);
+
        return result;
 }
 
@@ -452,7 +466,7 @@ void radv_GetPhysicalDeviceFeatures(
                .imageCubeArray                           = true,
                .independentBlend                         = true,
                .geometryShader                           = true,
-               .tessellationShader                       = false,
+               .tessellationShader                       = true,
                .sampleRateShading                        = false,
                .dualSrcBlend                             = true,
                .logicOp                                  = true,
@@ -471,7 +485,7 @@ void radv_GetPhysicalDeviceFeatures(
                .textureCompressionASTC_LDR               = false,
                .textureCompressionBC                     = true,
                .occlusionQueryPrecise                    = true,
-               .pipelineStatisticsQuery                  = false,
+               .pipelineStatisticsQuery                  = true,
                .vertexPipelineStoresAndAtomics           = true,
                .fragmentStoresAndAtomics                 = true,
                .shaderTessellationAndGeometryPointSize   = true,
@@ -489,8 +503,9 @@ void radv_GetPhysicalDeviceFeatures(
                .shaderFloat64                            = true,
                .shaderInt64                              = false,
                .shaderInt16                              = false,
-               .variableMultisampleRate                  = false,
-               .inheritedQueries                         = false,
+               .sparseBinding                            = true,
+               .variableMultisampleRate                  = true,
+               .inheritedQueries                         = true,
        };
 }
 
@@ -529,6 +544,20 @@ void radv_GetPhysicalDeviceProperties(
 {
        RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
        VkSampleCountFlags sample_counts = 0xf;
+
+       /* Make sure that the entire descriptor set is addressable with a signed
+        * 32-bit int, so the sum of all limits scaled by descriptor size has to
+        * be at most 2 GiB. A combined image & sampler object counts as one of
+        * each. This limit is for the pipeline layout, not for the set layout, but
+        * there is no set limit, so we just set a pipeline limit. I don't think
+        * any app is going to hit this soon. */
+       size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
+                 (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
+                  32 /* storage buffer, 32 due to potential space wasted on alignment */ +
+                  32 /* sampler, largest when combined with image */ +
+                  64 /* sampled image */ +
+                  64 /* storage image */);
+
        VkPhysicalDeviceLimits limits = {
                .maxImageDimension1D                      = (1 << 14),
                .maxImageDimension2D                      = (1 << 14),
@@ -542,37 +571,37 @@ void radv_GetPhysicalDeviceProperties(
                .maxMemoryAllocationCount                 = UINT32_MAX,
                .maxSamplerAllocationCount                = 64 * 1024,
                .bufferImageGranularity                   = 64, /* A cache line */
-               .sparseAddressSpaceSize                   = 0,
+               .sparseAddressSpaceSize                   = 0xffffffffu, /* buffer max size */
                .maxBoundDescriptorSets                   = MAX_SETS,
-               .maxPerStageDescriptorSamplers            = 64,
-               .maxPerStageDescriptorUniformBuffers      = 64,
-               .maxPerStageDescriptorStorageBuffers      = 64,
-               .maxPerStageDescriptorSampledImages       = 64,
-               .maxPerStageDescriptorStorageImages       = 64,
-               .maxPerStageDescriptorInputAttachments    = 64,
-               .maxPerStageResources                     = 128,
-               .maxDescriptorSetSamplers                 = 256,
-               .maxDescriptorSetUniformBuffers           = 256,
-               .maxDescriptorSetUniformBuffersDynamic    = 256,
-               .maxDescriptorSetStorageBuffers           = 256,
-               .maxDescriptorSetStorageBuffersDynamic    = 256,
-               .maxDescriptorSetSampledImages            = 256,
-               .maxDescriptorSetStorageImages            = 256,
-               .maxDescriptorSetInputAttachments         = 256,
+               .maxPerStageDescriptorSamplers            = max_descriptor_set_size,
+               .maxPerStageDescriptorUniformBuffers      = max_descriptor_set_size,
+               .maxPerStageDescriptorStorageBuffers      = max_descriptor_set_size,
+               .maxPerStageDescriptorSampledImages       = max_descriptor_set_size,
+               .maxPerStageDescriptorStorageImages       = max_descriptor_set_size,
+               .maxPerStageDescriptorInputAttachments    = max_descriptor_set_size,
+               .maxPerStageResources                     = max_descriptor_set_size,
+               .maxDescriptorSetSamplers                 = max_descriptor_set_size,
+               .maxDescriptorSetUniformBuffers           = max_descriptor_set_size,
+               .maxDescriptorSetUniformBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
+               .maxDescriptorSetStorageBuffers           = max_descriptor_set_size,
+               .maxDescriptorSetStorageBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
+               .maxDescriptorSetSampledImages            = max_descriptor_set_size,
+               .maxDescriptorSetStorageImages            = max_descriptor_set_size,
+               .maxDescriptorSetInputAttachments         = max_descriptor_set_size,
                .maxVertexInputAttributes                 = 32,
                .maxVertexInputBindings                   = 32,
                .maxVertexInputAttributeOffset            = 2047,
                .maxVertexInputBindingStride              = 2048,
                .maxVertexOutputComponents                = 128,
-               .maxTessellationGenerationLevel           = 0,
-               .maxTessellationPatchSize                 = 0,
-               .maxTessellationControlPerVertexInputComponents = 0,
-               .maxTessellationControlPerVertexOutputComponents = 0,
-               .maxTessellationControlPerPatchOutputComponents = 0,
-               .maxTessellationControlTotalOutputComponents = 0,
-               .maxTessellationEvaluationInputComponents = 0,
-               .maxTessellationEvaluationOutputComponents = 0,
-               .maxGeometryShaderInvocations             = 32,
+               .maxTessellationGenerationLevel           = 64,
+               .maxTessellationPatchSize                 = 32,
+               .maxTessellationControlPerVertexInputComponents = 128,
+               .maxTessellationControlPerVertexOutputComponents = 128,
+               .maxTessellationControlPerPatchOutputComponents = 120,
+               .maxTessellationControlTotalOutputComponents = 4096,
+               .maxTessellationEvaluationInputComponents = 128,
+               .maxTessellationEvaluationOutputComponents = 128,
+               .maxGeometryShaderInvocations             = 127,
                .maxGeometryInputComponents               = 64,
                .maxGeometryOutputComponents              = 128,
                .maxGeometryOutputVertices                = 256,
@@ -625,8 +654,8 @@ void radv_GetPhysicalDeviceProperties(
                .sampledImageStencilSampleCounts          = sample_counts,
                .storageImageSampleCounts                 = VK_SAMPLE_COUNT_1_BIT,
                .maxSampleMaskWords                       = 1,
-               .timestampComputeAndGraphics              = false,
-               .timestampPeriod                          = 100000.0 / pdevice->rad_info.clock_crystal_freq,
+               .timestampComputeAndGraphics              = true,
+               .timestampPeriod                          = 1000000.0 / pdevice->rad_info.clock_crystal_freq,
                .maxClipDistances                         = 8,
                .maxCullDistances                         = 8,
                .maxCombinedClipAndCullDistances          = 8,
@@ -649,7 +678,7 @@ void radv_GetPhysicalDeviceProperties(
                .deviceID = pdevice->rad_info.pci_id,
                .deviceType = VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU,
                .limits = limits,
-               .sparseProperties = {0}, /* Broadwell doesn't do sparse. */
+               .sparseProperties = {0},
        };
 
        strcpy(pProperties->deviceName, pdevice->name);
@@ -660,7 +689,20 @@ void radv_GetPhysicalDeviceProperties2KHR(
        VkPhysicalDevice                            physicalDevice,
        VkPhysicalDeviceProperties2KHR             *pProperties)
 {
-       return radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
+       radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
+
+       vk_foreach_struct(ext, pProperties->pNext) {
+               switch (ext->sType) {
+               case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: {
+                       VkPhysicalDevicePushDescriptorPropertiesKHR *properties =
+                               (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext;
+                       properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
+                       break;
+               }
+               default:
+                       break;
+               }
+       }
 }
 
 static void radv_get_physical_device_queue_family_properties(
@@ -687,8 +729,9 @@ static void radv_get_physical_device_queue_family_properties(
        if (*pCount >= 1) {
                *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
                        .queueFlags = VK_QUEUE_GRAPHICS_BIT |
-                       VK_QUEUE_COMPUTE_BIT |
-                       VK_QUEUE_TRANSFER_BIT,
+                                     VK_QUEUE_COMPUTE_BIT |
+                                     VK_QUEUE_TRANSFER_BIT |
+                                     VK_QUEUE_SPARSE_BINDING_BIT,
                        .queueCount = 1,
                        .timestampValidBits = 64,
                        .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
@@ -701,7 +744,9 @@ static void radv_get_physical_device_queue_family_properties(
            !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
                if (*pCount > idx) {
                        *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
-                               .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT,
+                               .queueFlags = VK_QUEUE_COMPUTE_BIT |
+                                             VK_QUEUE_TRANSFER_BIT |
+                                             VK_QUEUE_SPARSE_BINDING_BIT,
                                .queueCount = pdevice->rad_info.compute_rings,
                                .timestampValidBits = 64,
                                .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
@@ -841,6 +886,10 @@ radv_queue_finish(struct radv_queue *queue)
                queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
        if (queue->gsvs_ring_bo)
                queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
+       if (queue->tess_factor_ring_bo)
+               queue->device->ws->buffer_destroy(queue->tess_factor_ring_bo);
+       if (queue->tess_offchip_ring_bo)
+               queue->device->ws->buffer_destroy(queue->tess_offchip_ring_bo);
        if (queue->compute_scratch_bo)
                queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
 }
@@ -868,6 +917,7 @@ radv_device_init_gs_info(struct radv_device *device)
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
+       case CHIP_POLARIS12:
                device->gs_table_depth = 32;
                return;
        default:
@@ -958,6 +1008,12 @@ VkResult radv_CreateDevice(
 
        radv_device_init_gs_info(device);
 
+       device->tess_offchip_block_dw_size =
+               device->physical_device->rad_info.family == CHIP_HAWAII ? 4096 : 8192;
+       device->has_distributed_tess =
+               device->physical_device->rad_info.chip_class >= VI &&
+               device->physical_device->rad_info.max_se >= 2;
+
        result = radv_device_init_meta(device);
        if (result != VK_SUCCESS)
                goto fail;
@@ -993,6 +1049,22 @@ VkResult radv_CreateDevice(
                        break;
                }
                device->ws->cs_finalize(device->flush_cs[family]);
+
+               device->flush_shader_cs[family] = device->ws->cs_create(device->ws, family);
+               switch (family) {
+               case RADV_QUEUE_GENERAL:
+               case RADV_QUEUE_COMPUTE:
+                       si_cs_emit_cache_flush(device->flush_shader_cs[family],
+                                              device->physical_device->rad_info.chip_class,
+                                              family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
+                                              family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
+                                              RADV_CMD_FLAG_INV_ICACHE |
+                                              RADV_CMD_FLAG_INV_SMEM_L1 |
+                                              RADV_CMD_FLAG_INV_VMEM_L1 |
+                                              RADV_CMD_FLAG_INV_GLOBAL_L2);
+                       break;
+               }
+               device->ws->cs_finalize(device->flush_shader_cs[family]);
        }
 
        if (getenv("RADV_TRACE_FILE")) {
@@ -1068,6 +1140,8 @@ void radv_DestroyDevice(
                        device->ws->cs_destroy(device->empty_cs[i]);
                if (device->flush_cs[i])
                        device->ws->cs_destroy(device->flush_cs[i]);
+               if (device->flush_shader_cs[i])
+                       device->ws->cs_destroy(device->flush_shader_cs[i]);
        }
        radv_device_finish_meta(device);
 
@@ -1172,20 +1246,30 @@ static void radv_dump_trace(struct radv_device *device,
 }
 
 static void
-fill_geom_rings(struct radv_queue *queue,
-               uint32_t *map,
-               uint32_t esgs_ring_size,
-               struct radeon_winsys_bo *esgs_ring_bo,
-               uint32_t gsvs_ring_size,
-               struct radeon_winsys_bo *gsvs_ring_bo)
+fill_geom_tess_rings(struct radv_queue *queue,
+                    uint32_t *map,
+                    bool add_sample_positions,
+                    uint32_t esgs_ring_size,
+                    struct radeon_winsys_bo *esgs_ring_bo,
+                    uint32_t gsvs_ring_size,
+                    struct radeon_winsys_bo *gsvs_ring_bo,
+                    uint32_t tess_factor_ring_size,
+                    struct radeon_winsys_bo *tess_factor_ring_bo,
+                    uint32_t tess_offchip_ring_size,
+                    struct radeon_winsys_bo *tess_offchip_ring_bo)
 {
        uint64_t esgs_va = 0, gsvs_va = 0;
+       uint64_t tess_factor_va = 0, tess_offchip_va = 0;
        uint32_t *desc = &map[4];
 
        if (esgs_ring_bo)
                esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
        if (gsvs_ring_bo)
                gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);
+       if (tess_factor_ring_bo)
+               tess_factor_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo);
+       if (tess_offchip_ring_bo)
+               tess_offchip_va = queue->device->ws->buffer_get_va(tess_offchip_ring_bo);
 
        /* stride 0, num records - size, add tid, swizzle, elsize4,
           index stride 64 */
@@ -1260,6 +1344,100 @@ fill_geom_rings(struct radv_queue *queue,
                S_008F0C_ELEMENT_SIZE(1) |
                S_008F0C_INDEX_STRIDE(1) |
                S_008F0C_ADD_TID_ENABLE(true);
+       desc += 4;
+
+       desc[0] = tess_factor_va;
+       desc[1] = S_008F04_BASE_ADDRESS_HI(tess_factor_va >> 32) |
+               S_008F04_STRIDE(0) |
+               S_008F04_SWIZZLE_ENABLE(false);
+       desc[2] = tess_factor_ring_size;
+       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+               S_008F0C_ELEMENT_SIZE(0) |
+               S_008F0C_INDEX_STRIDE(0) |
+               S_008F0C_ADD_TID_ENABLE(false);
+       desc += 4;
+
+       desc[0] = tess_offchip_va;
+       desc[1] = S_008F04_BASE_ADDRESS_HI(tess_offchip_va >> 32) |
+               S_008F04_STRIDE(0) |
+               S_008F04_SWIZZLE_ENABLE(false);
+       desc[2] = tess_offchip_ring_size;
+       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+               S_008F0C_ELEMENT_SIZE(0) |
+               S_008F0C_INDEX_STRIDE(0) |
+               S_008F0C_ADD_TID_ENABLE(false);
+       desc += 4;
+
+       /* add sample positions after all rings */
+       memcpy(desc, queue->device->sample_locations_1x, 8);
+       desc += 2;
+       memcpy(desc, queue->device->sample_locations_2x, 16);
+       desc += 4;
+       memcpy(desc, queue->device->sample_locations_4x, 32);
+       desc += 8;
+       memcpy(desc, queue->device->sample_locations_8x, 64);
+       desc += 16;
+       memcpy(desc, queue->device->sample_locations_16x, 128);
+}
+
+static unsigned
+radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buffers_p)
+{ /* Builds the packed HS off-chip parameter value for this device and reports the clamped off-chip buffer count via *max_offchip_buffers_p. */
+       bool double_offchip_buffers = device->physical_device->rad_info.chip_class >= CIK &&
+               device->physical_device->rad_info.family != CHIP_CARRIZO &&
+               device->physical_device->rad_info.family != CHIP_STONEY; /* CIK and newer, except Carrizo and Stoney */
+       unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
+       unsigned max_offchip_buffers = max_offchip_buffers_per_se *
+               device->physical_device->rad_info.max_se; /* per-SE budget scaled by rad_info.max_se */
+       unsigned offchip_granularity;
+       unsigned hs_offchip_param;
+       switch (device->tess_offchip_block_dw_size) { /* translate the block size in dwords to the register's granularity value */
+       default:
+               assert(0); /* only 4096 and 8192 are handled here */
+               /* fall through: when assert() is compiled out, any other size uses the 8K setting */
+       case 8192:
+               offchip_granularity = V_03093C_X_8K_DWORDS;
+               break;
+       case 4096:
+               offchip_granularity = V_03093C_X_4K_DWORDS;
+               break;
+       }
+
+       switch (device->physical_device->rad_info.chip_class) { /* clamp the total per generation; NOTE(review): presumably hardware field limits - confirm */
+       case SI:
+               max_offchip_buffers = MIN2(max_offchip_buffers, 126);
+               break;
+       case CIK:
+               max_offchip_buffers = MIN2(max_offchip_buffers, 508);
+               break;
+       case VI:
+       default: /* VI and any other chip class share the 512 cap */
+               max_offchip_buffers = MIN2(max_offchip_buffers, 512);
+               break;
+       }
+
+       *max_offchip_buffers_p = max_offchip_buffers; /* caller gets the clamped count, taken before the VI+ decrement below */
+       if (device->physical_device->rad_info.chip_class >= CIK) {
+               if (device->physical_device->rad_info.chip_class >= VI)
+                       --max_offchip_buffers; /* NOTE(review): presumably the VI+ field is encoded as count minus one - confirm */
+               hs_offchip_param =
+                       S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
+                       S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); /* CIK+: buffer count and granularity packed together */
+       } else {
+               hs_offchip_param =
+                       S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); /* SI: only the buffer count is packed; offchip_granularity is unused */
+       }
+       return hs_offchip_param; /* packed register value */
 }
 
 static VkResult
@@ -1268,6 +1446,8 @@ radv_get_preamble_cs(struct radv_queue *queue,
                      uint32_t compute_scratch_size,
                     uint32_t esgs_ring_size,
                     uint32_t gsvs_ring_size,
+                    bool needs_tess_rings,
+                    bool needs_sample_positions,
                      struct radeon_winsys_cs **initial_preamble_cs,
                      struct radeon_winsys_cs **continue_preamble_cs)
 {
@@ -1276,12 +1456,32 @@ radv_get_preamble_cs(struct radv_queue *queue,
        struct radeon_winsys_bo *compute_scratch_bo = NULL;
        struct radeon_winsys_bo *esgs_ring_bo = NULL;
        struct radeon_winsys_bo *gsvs_ring_bo = NULL;
+       struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
+       struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
        struct radeon_winsys_cs *dest_cs[2] = {0};
+       bool add_tess_rings = false, add_sample_positions = false;
+       unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
+       unsigned max_offchip_buffers;
+       unsigned hs_offchip_param = 0;
+       if (!queue->has_tess_rings) {
+               if (needs_tess_rings)
+                       add_tess_rings = true;
+       }
+       if (!queue->has_sample_positions) {
+               if (needs_sample_positions)
+                       add_sample_positions = true;
+       }
+       tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se;
+       hs_offchip_param = radv_get_hs_offchip_param(queue->device,
+                                                    &max_offchip_buffers);
+       tess_offchip_ring_size = max_offchip_buffers *
+               queue->device->tess_offchip_block_dw_size * 4;
 
        if (scratch_size <= queue->scratch_size &&
            compute_scratch_size <= queue->compute_scratch_size &&
            esgs_ring_size <= queue->esgs_ring_size &&
            gsvs_ring_size <= queue->gsvs_ring_size &&
+           !add_tess_rings && !add_sample_positions &&
            queue->initial_preamble_cs) {
                *initial_preamble_cs = queue->initial_preamble_cs;
                *continue_preamble_cs = queue->continue_preamble_cs;
@@ -1339,12 +1539,38 @@ radv_get_preamble_cs(struct radv_queue *queue,
                gsvs_ring_size = queue->gsvs_ring_size;
        }
 
+       if (add_tess_rings) {
+               tess_factor_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                                      tess_factor_ring_size,
+                                                                      256,
+                                                                      RADEON_DOMAIN_VRAM,
+                                                                      RADEON_FLAG_NO_CPU_ACCESS);
+               if (!tess_factor_ring_bo)
+                       goto fail;
+               tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                                      tess_offchip_ring_size,
+                                                                      256,
+                                                                      RADEON_DOMAIN_VRAM,
+                                                                      RADEON_FLAG_NO_CPU_ACCESS);
+               if (!tess_offchip_ring_bo)
+                       goto fail;
+       } else {
+               tess_factor_ring_bo = queue->tess_factor_ring_bo;
+               tess_offchip_ring_bo = queue->tess_offchip_ring_bo;
+       }
+
        if (scratch_bo != queue->scratch_bo ||
            esgs_ring_bo != queue->esgs_ring_bo ||
-           gsvs_ring_bo != queue->gsvs_ring_bo) {
+           gsvs_ring_bo != queue->gsvs_ring_bo ||
+           tess_factor_ring_bo != queue->tess_factor_ring_bo ||
+           tess_offchip_ring_bo != queue->tess_offchip_ring_bo || add_sample_positions) {
                uint32_t size = 0;
-               if (gsvs_ring_bo || esgs_ring_bo)
-                       size = 80; /* 2 dword + 2 padding + 4 dword * 4 */
+               if (gsvs_ring_bo || esgs_ring_bo ||
+                   tess_factor_ring_bo || tess_offchip_ring_bo || add_sample_positions) {
+                       size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
+                       if (add_sample_positions)
+                               size += 256; /* 16+8+4+2+1 samples * 4 * 2 = 248 bytes. */
+               }
                else if (scratch_bo)
                        size = 8; /* 2 dword */
 
@@ -1376,6 +1602,12 @@ radv_get_preamble_cs(struct radv_queue *queue,
                if (gsvs_ring_bo)
                        queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
 
+               if (tess_factor_ring_bo)
+                       queue->device->ws->cs_add_buffer(cs, tess_factor_ring_bo, 8);
+
+               if (tess_offchip_ring_bo)
+                       queue->device->ws->cs_add_buffer(cs, tess_offchip_ring_bo, 8);
+
                if (descriptor_bo)
                        queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
 
@@ -1390,18 +1622,25 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                map[1] = rsrc1;
                        }
 
-                       if (esgs_ring_bo || gsvs_ring_bo)
-                               fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
+                       if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo ||
+                           add_sample_positions)
+                               fill_geom_tess_rings(queue, map, add_sample_positions,
+                                                    esgs_ring_size, esgs_ring_bo,
+                                                    gsvs_ring_size, gsvs_ring_bo,
+                                                    tess_factor_ring_size, tess_factor_ring_bo,
+                                                    tess_offchip_ring_size, tess_offchip_ring_bo);
 
                        queue->device->ws->buffer_unmap(descriptor_bo);
                }
 
-               if (esgs_ring_bo || gsvs_ring_bo) {
+               if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) {
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+               }
 
+               if (esgs_ring_bo || gsvs_ring_bo) {
                        if (queue->device->physical_device->rad_info.chip_class >= CIK) {
                                radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
                                radeon_emit(cs, esgs_ring_size >> 8);
@@ -1413,6 +1652,24 @@ radv_get_preamble_cs(struct radv_queue *queue,
                        }
                }
 
+               if (tess_factor_ring_bo) {
+                       uint64_t tf_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo);
+                       if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+                               radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
+                                                      S_030938_SIZE(tess_factor_ring_size / 4));
+                               radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
+                                                      tf_va >> 8);
+                               radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
+                       } else {
+                               radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
+                                                     S_008988_SIZE(tess_factor_ring_size / 4));
+                               radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
+                                                     tf_va >> 8);
+                               radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
+                                                     hs_offchip_param);
+                       }
+               }
+
                if (descriptor_bo) {
                        uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
                                           R_00B130_SPI_SHADER_USER_DATA_VS_0,
@@ -1494,6 +1751,15 @@ radv_get_preamble_cs(struct radv_queue *queue,
                queue->gsvs_ring_size = gsvs_ring_size;
        }
 
+       if (tess_factor_ring_bo != queue->tess_factor_ring_bo) {
+               queue->tess_factor_ring_bo = tess_factor_ring_bo;
+       }
+
+       if (tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
+               queue->tess_offchip_ring_bo = tess_offchip_ring_bo;
+               queue->has_tess_rings = true;
+       }
+
        if (descriptor_bo != queue->descriptor_bo) {
                if (queue->descriptor_bo)
                        queue->device->ws->buffer_destroy(queue->descriptor_bo);
@@ -1501,6 +1767,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
                queue->descriptor_bo = descriptor_bo;
        }
 
+       if (add_sample_positions)
+               queue->has_sample_positions = true;
+
        *initial_preamble_cs = queue->initial_preamble_cs;
        *continue_preamble_cs = queue->continue_preamble_cs;
        if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
@@ -1520,6 +1789,10 @@ fail:
                queue->device->ws->buffer_destroy(esgs_ring_bo);
        if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
                queue->device->ws->buffer_destroy(gsvs_ring_bo);
+       if (tess_factor_ring_bo && tess_factor_ring_bo != queue->tess_factor_ring_bo)
+               queue->device->ws->buffer_destroy(tess_factor_ring_bo);
+       if (tess_offchip_ring_bo && tess_offchip_ring_bo != queue->tess_offchip_ring_bo)
+               queue->device->ws->buffer_destroy(tess_offchip_ring_bo);
        return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 }
 
@@ -1541,6 +1814,8 @@ VkResult radv_QueueSubmit(
        struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
        VkResult result;
        bool fence_emitted = false;
+       bool tess_rings_needed = false;
+       bool sample_positions_needed = false;
 
        /* Do this first so failing to allocate scratch buffers can't result in
         * partially executed submissions. */
@@ -1554,18 +1829,21 @@ VkResult radv_QueueSubmit(
                                                    cmd_buffer->compute_scratch_size_needed);
                        esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
                        gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
+                       tess_rings_needed |= cmd_buffer->tess_rings_needed;
+                       sample_positions_needed |= cmd_buffer->sample_positions_needed;
                }
        }
 
        result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
-                                     esgs_ring_size, gsvs_ring_size,
+                                     esgs_ring_size, gsvs_ring_size, tess_rings_needed,
+                                     sample_positions_needed,
                                      &initial_preamble_cs, &continue_preamble_cs);
        if (result != VK_SUCCESS)
                return result;
 
        for (uint32_t i = 0; i < submitCount; i++) {
                struct radeon_winsys_cs **cs_array;
-               bool do_flush = !i;
+               bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
                bool can_patch = !do_flush;
                uint32_t advance;
 
@@ -1592,7 +1870,9 @@ VkResult radv_QueueSubmit(
                                                (pSubmits[i].commandBufferCount + do_flush));
 
                if(do_flush)
-                       cs_array[0] = queue->device->flush_cs[queue->queue_family_index];
+                       cs_array[0] = pSubmits[i].waitSemaphoreCount ?
+                               queue->device->flush_shader_cs[queue->queue_family_index] :
+                               queue->device->flush_cs[queue->queue_family_index];
 
                for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
                        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
@@ -1959,13 +2239,89 @@ VkResult radv_BindImageMemory(
        return VK_SUCCESS;
 }
 
-VkResult radv_QueueBindSparse(
-       VkQueue                                     queue,
+
+static void
+radv_sparse_buffer_bind_memory(struct radv_device *device,
+                               const VkSparseBufferMemoryBindInfo *bind)
+{
+       RADV_FROM_HANDLE(radv_buffer, buffer, bind->buffer);
+       /* Issue one ws->buffer_virtual_bind() call on buffer->bo for each entry of bind->pBinds. */
+       for (uint32_t i = 0; i < bind->bindCount; ++i) {
+               struct radv_device_memory *mem = NULL;
+               /* mem stays NULL when this entry's memory handle is VK_NULL_HANDLE. */
+               if (bind->pBinds[i].memory != VK_NULL_HANDLE)
+                       mem = radv_device_memory_from_handle(bind->pBinds[i].memory);
+               /* A NULL backing bo is passed for a null handle; presumably that unbinds the range -- confirm in the winsys. */
+               device->ws->buffer_virtual_bind(buffer->bo,
+                                               bind->pBinds[i].resourceOffset,
+                                               bind->pBinds[i].size,
+                                               mem ? mem->bo : NULL,
+                                               bind->pBinds[i].memoryOffset);
+       }
+}
+
+static void
+radv_sparse_image_opaque_bind_memory(struct radv_device *device,
+                                     const VkSparseImageOpaqueMemoryBindInfo *bind)
+{
+       RADV_FROM_HANDLE(radv_image, image, bind->image);
+       /* Issue one ws->buffer_virtual_bind() call on image->bo for each entry of bind->pBinds. */
+       for (uint32_t i = 0; i < bind->bindCount; ++i) {
+               struct radv_device_memory *mem = NULL;
+               /* mem stays NULL when this entry's memory handle is VK_NULL_HANDLE. */
+               if (bind->pBinds[i].memory != VK_NULL_HANDLE)
+                       mem = radv_device_memory_from_handle(bind->pBinds[i].memory);
+               /* resourceOffset and size are forwarded unchanged as the range within image->bo. */
+               device->ws->buffer_virtual_bind(image->bo,
+                                               bind->pBinds[i].resourceOffset,
+                                               bind->pBinds[i].size,
+                                               mem ? mem->bo : NULL,
+                                               bind->pBinds[i].memoryOffset);
+       }
+}
+
+ VkResult radv_QueueBindSparse(
+       VkQueue                                     _queue,
        uint32_t                                    bindInfoCount,
        const VkBindSparseInfo*                     pBindInfo,
-       VkFence                                     fence)
+       VkFence                                     _fence)
 {
-       stub_return(VK_ERROR_INCOMPATIBLE_DRIVER);
+       RADV_FROM_HANDLE(radv_fence, fence, _fence);
+       RADV_FROM_HANDLE(radv_queue, queue, _queue);
+       struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
+       bool fence_emitted = false;
+       /* Per bind info: apply buffer binds, then opaque image binds, then submit if semaphores are involved. */
+       for (uint32_t i = 0; i < bindInfoCount; ++i) {
+               for (uint32_t j = 0; j < pBindInfo[i].bufferBindCount; ++j) {
+                       radv_sparse_buffer_bind_memory(queue->device,
+                                                      pBindInfo[i].pBufferBinds + j);
+               }
+               /* Only opaque image binds are applied here; pImageBinds is not read by this function. */
+               for (uint32_t j = 0; j < pBindInfo[i].imageOpaqueBindCount; ++j) {
+                       radv_sparse_image_opaque_bind_memory(queue->device,
+                                                            pBindInfo[i].pImageOpaqueBinds + j);
+               }
+               /* With wait or signal semaphores, submit this family's empty_cs carrying both lists and base_fence; NOTE(review): the cs_submit result is not checked. */
+               if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) {
+                       queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
+                                                    &queue->device->empty_cs[queue->queue_family_index],
+                                                    1, NULL, NULL,
+                                                    (struct radeon_winsys_sem **)pBindInfo[i].pWaitSemaphores,
+                                                    pBindInfo[i].waitSemaphoreCount,
+                                                    (struct radeon_winsys_sem **)pBindInfo[i].pSignalSemaphores,
+                                                    pBindInfo[i].signalSemaphoreCount,
+                                                    false, base_fence);
+                       fence_emitted = true;
+                       if (fence)
+                               fence->submitted = true;
+               }
+       }
+       /* No submission was made above, so the fence (if one was given) is marked signalled directly. */
+       if (fence && !fence_emitted) {
+               fence->signalled = true;
+       }
+
+       return VK_SUCCESS;
 }
 
 VkResult radv_CreateFence(
@@ -2307,8 +2663,8 @@ radv_initialise_color_surface(struct radv_device *device,
        cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == VK_SWIZZLE_1) |
                S_028C74_TILE_MODE_INDEX(tile_mode_index);
 
-       if (iview->image->samples > 1) {
-               unsigned log_samples = util_logbase2(iview->image->samples);
+       if (iview->image->info.samples > 1) {
+               unsigned log_samples = util_logbase2(iview->image->info.samples);
 
                cb->cb_color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
                        S_028C74_NUM_FRAGMENTS(log_samples);
@@ -2372,7 +2728,7 @@ radv_initialise_color_surface(struct radv_device *device,
                                    format != V_028C70_COLOR_24_8) |
                S_028C70_NUMBER_TYPE(ntype) |
                S_028C70_ENDIAN(endian);
-       if (iview->image->samples > 1)
+       if (iview->image->info.samples > 1)
                if (iview->image->fmask.size)
                        cb->cb_color_info |= S_028C70_COMPRESSION(1);
 
@@ -2385,7 +2741,7 @@ radv_initialise_color_surface(struct radv_device *device,
 
        if (device->physical_device->rad_info.chip_class >= VI) {
                unsigned max_uncompressed_block_size = 2;
-               if (iview->image->samples > 1) {
+               if (iview->image->info.samples > 1) {
                        if (iview->image->surface.bpe == 1)
                                max_uncompressed_block_size = 0;
                        else if (iview->image->surface.bpe == 2)
@@ -2413,6 +2769,7 @@ radv_initialise_ds_surface(struct radv_device *device,
        unsigned format;
        uint64_t va, s_offs, z_offs;
        const struct radeon_surf_level *level_info = &iview->image->surface.level[level];
+       bool stencil_only = false;
        memset(ds, 0, sizeof(*ds));
        switch (iview->vk_format) {
        case VK_FORMAT_D24_UNORM_S8_UINT:
@@ -2431,6 +2788,10 @@ radv_initialise_ds_surface(struct radv_device *device,
                        S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
                ds->offset_scale = 1.0f;
                break;
+       case VK_FORMAT_S8_UINT:
+               stencil_only = true;
+               level_info = &iview->image->surface.stencil_level[level];
+               break;
        default:
                break;
        }
@@ -2448,8 +2809,8 @@ radv_initialise_ds_surface(struct radv_device *device,
        ds->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
        ds->db_z_info = S_028040_FORMAT(format) | S_028040_ZRANGE_PRECISION(1);
 
-       if (iview->image->samples > 1)
-               ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->samples));
+       if (iview->image->info.samples > 1)
+               ds->db_z_info |= S_028040_NUM_SAMPLES(util_logbase2(iview->image->info.samples));
 
        if (iview->image->surface.flags & RADEON_SURF_SBUFFER)
                ds->db_stencil_info = S_028044_FORMAT(V_028044_STENCIL_8);
@@ -2465,6 +2826,9 @@ radv_initialise_ds_surface(struct radv_device *device,
                unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
                unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
 
+               if (stencil_only)
+                       tile_mode = stencil_tile_mode;
+
                ds->db_depth_info |=
                        S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
                        S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
@@ -2482,24 +2846,9 @@ radv_initialise_ds_surface(struct radv_device *device,
        }
 
        if (iview->image->surface.htile_size && !level) {
-               ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
-                       S_028040_ALLOW_EXPCLEAR(1);
-
-               if (iview->image->surface.flags & RADEON_SURF_SBUFFER) {
-                       /* Workaround: For a not yet understood reason, the
-                        * combination of MSAA, fast stencil clear and stencil
-                        * decompress messes with subsequent stencil buffer
-                        * uses. Problem was reproduced on Verde, Bonaire,
-                        * Tonga, and Carrizo.
-                        *
-                        * Disabling EXPCLEAR works around the problem.
-                        *
-                        * Check piglit's arb_texture_multisample-stencil-clear
-                        * test if you want to try changing this.
-                        */
-                       if (iview->image->samples <= 1)
-                               ds->db_stencil_info |= S_028044_ALLOW_EXPCLEAR(1);
-               } else
+               ds->db_z_info |= S_028040_TILE_SURFACE_ENABLE(1);
+
+               if (!(iview->image->surface.flags & RADEON_SURF_SBUFFER))
                        /* Use all of the htile_buffer for depth if there's no stencil. */
                        ds->db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);