radeonsi: fix gl_ClipDistance and gl_ClipVertex for points
[mesa.git] / src / amd / vulkan / radv_cmd_buffer.c
index 4ae4e54f87ea5df6de0169b7cefc59b1cbdfb471..d29432b64e82580ec6e0af1d1ab7ffd076fca666 100644 (file)
@@ -36,6 +36,8 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image,
                                         VkImageLayout src_layout,
                                         VkImageLayout dst_layout,
+                                        int src_family,
+                                        int dst_family,
                                         VkImageSubresourceRange range,
                                         VkImageAspectFlags pending_clears);
 
@@ -110,6 +112,25 @@ radv_dynamic_state_copy(struct radv_dynamic_state *dest,
                dest->stencil_reference = src->stencil_reference;
 }
 
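+/* On CIK and newer parts the compute rings are fed by the MEC (micro engine
+ * compute), whose packet encodings differ slightly from the graphics ME;
+ * the emit paths below use this helper to pick the right form. */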
+bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
+{
+       return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
+              cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK;
+}
+
+enum ring_type radv_queue_family_to_ring(int f) {
+       switch (f) {
+       case RADV_QUEUE_GENERAL:
+               return RING_GFX;
+       case RADV_QUEUE_COMPUTE:
+               return RING_COMPUTE;
+       case RADV_QUEUE_TRANSFER:
+               return RING_DMA;
+       default:
+               unreachable("Unknown queue family");
+       }
+}
+
 static VkResult radv_create_cmd_buffer(
        struct radv_device *                         device,
        struct radv_cmd_pool *                       pool,
@@ -118,7 +139,7 @@ static VkResult radv_create_cmd_buffer(
 {
        struct radv_cmd_buffer *cmd_buffer;
        VkResult result;
-
+       unsigned ring;
        cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8,
                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
        if (cmd_buffer == NULL)
@@ -132,14 +153,19 @@ static VkResult radv_create_cmd_buffer(
 
        if (pool) {
                list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
+               cmd_buffer->queue_family_index = pool->queue_family_index;
+
        } else {
               /* Init the pool_link so we can safely call list_del when we destroy
                 * the command buffer
                 */
                list_inithead(&cmd_buffer->pool_link);
+               cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
        }
 
-       cmd_buffer->cs = device->ws->cs_create(device->ws, RING_GFX);
+       ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
+
+       cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
        if (!cmd_buffer->cs) {
                result = VK_ERROR_OUT_OF_HOST_MEMORY;
                goto fail;
@@ -1046,8 +1072,8 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer)
        uint32_t ia_multi_vgt_param;
        uint32_t ls_hs_config = 0;
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs,
-                                             4096);
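+       /* cdw_max is only consumed by the assert at the end of this function,
+        * so mark it MAYBE_UNUSED to avoid a warning in release builds where
+        * asserts are compiled out. */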
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+                                                          cmd_buffer->cs, 4096);
 
        if ((cmd_buffer->state.vertex_descriptors_dirty || cmd_buffer->state.vb_dirty) &&
            cmd_buffer->state.pipeline->num_vertex_attribs) {
@@ -1183,7 +1209,7 @@ static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buf
        radv_handle_image_transition(cmd_buffer,
                                     view->image,
                                     cmd_buffer->state.attachments[idx].current_layout,
-                                    att.layout, range,
+                                    att.layout, 0, 0, range,
                                     cmd_buffer->state.attachments[idx].pending_clear_aspects);
 
        cmd_buffer->state.attachments[idx].current_layout = att.layout;
@@ -1369,17 +1395,33 @@ VkResult radv_BeginCommandBuffer(
 
        /* setup initial configuration into command buffer */
        if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
-               /* Flush read caches at the beginning of CS not flushed by the kernel. */
-               cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_ICACHE |
-                       RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
-                       RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
-                       RADV_CMD_FLAG_INV_VMEM_L1 |
-                       RADV_CMD_FLAG_INV_SMEM_L1 |
-                       RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER |
-                       RADV_CMD_FLAG_INV_GLOBAL_L2;
-               si_init_config(&cmd_buffer->device->instance->physicalDevice, cmd_buffer);
-               radv_set_db_count_control(cmd_buffer);
-               si_emit_cache_flush(cmd_buffer);
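+               /* The initial cache flushes and config state depend on which
+                * hardware ring this command buffer targets; the transfer
+                * (SDMA) ring needs neither. */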
+               switch (cmd_buffer->queue_family_index) {
+               case RADV_QUEUE_GENERAL:
+                       /* Flush read caches at the beginning of CS not flushed by the kernel. */
+                       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_ICACHE |
+                               RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+                               RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+                               RADV_CMD_FLAG_INV_VMEM_L1 |
+                               RADV_CMD_FLAG_INV_SMEM_L1 |
+                               RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER |
+                               RADV_CMD_FLAG_INV_GLOBAL_L2;
+                       si_init_config(&cmd_buffer->device->instance->physicalDevice, cmd_buffer);
+                       radv_set_db_count_control(cmd_buffer);
+                       si_emit_cache_flush(cmd_buffer);
+                       break;
+               case RADV_QUEUE_COMPUTE:
+                       cmd_buffer->state.flush_bits = RADV_CMD_FLAG_INV_ICACHE |
+                               RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+                               RADV_CMD_FLAG_INV_VMEM_L1 |
+                               RADV_CMD_FLAG_INV_SMEM_L1 |
+                               RADV_CMD_FLAG_INV_GLOBAL_L2;
+                       si_init_compute(&cmd_buffer->device->instance->physicalDevice, cmd_buffer);
+                       si_emit_cache_flush(cmd_buffer);
+                       break;
+               case RADV_QUEUE_TRANSFER:
+               default:
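+                       /* SDMA does not consume PKT3 packets, so there is no
+                        * config or cache-flush state to emit here. */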
+                       break;
+               }
        }
 
        if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
@@ -1466,8 +1508,8 @@ void radv_CmdBindDescriptorSets(
        RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
        unsigned dyn_idx = 0;
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs,
-                                             MAX_SETS * 4 * 6);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+                                                          cmd_buffer->cs, MAX_SETS * 4 * 6);
 
        for (unsigned i = 0; i < descriptorSetCount; ++i) {
                unsigned idx = i + firstSet;
@@ -1515,7 +1557,8 @@ VkResult radv_EndCommandBuffer(
 {
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
-       si_emit_cache_flush(cmd_buffer);
+       if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
+               si_emit_cache_flush(cmd_buffer);
        if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
            cmd_buffer->record_fail)
                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -1540,7 +1583,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 
        ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 16);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+                                                          cmd_buffer->cs, 16);
 
        radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2);
        radeon_emit(cmd_buffer->cs, va >> 8);
@@ -1774,6 +1818,8 @@ VkResult radv_CreateCommandPool(
 
        list_inithead(&pool->cmd_buffers);
 
+       pool->queue_family_index = pCreateInfo->queueFamilyIndex;
+
        *pCmdPool = radv_cmd_pool_to_handle(pool);
 
        return VK_SUCCESS;
@@ -1823,8 +1869,8 @@ void radv_CmdBeginRenderPass(
        RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
        RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs,
-                                             2048);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+                                                          cmd_buffer->cs, 2048);
 
        cmd_buffer->state.framebuffer = framebuffer;
        cmd_buffer->state.pass = pass;
@@ -1865,7 +1911,7 @@ void radv_CmdDraw(
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
        radv_cmd_buffer_flush_state(cmd_buffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
 
        struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
                                                             AC_UD_VS_BASE_VERTEX_START_INSTANCE);
@@ -1913,7 +1959,7 @@ void radv_CmdDrawIndexed(
        radv_cmd_buffer_flush_state(cmd_buffer);
        radv_emit_primitive_reset_index(cmd_buffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
 
        radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
        radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
@@ -2003,7 +2049,8 @@ radv_cmd_draw_indirect_count(VkCommandBuffer                             command
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
        radv_cmd_buffer_flush_state(cmd_buffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+                                                          cmd_buffer->cs, 14);
 
        radv_emit_indirect_draw(cmd_buffer, buffer, offset,
                                countBuffer, countBufferOffset, maxDrawCount, stride, false);
@@ -2031,7 +2078,7 @@ radv_cmd_draw_indexed_indirect_count(
        index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo);
        index_va += cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset;
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 21);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 21);
 
        radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
        radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
@@ -2120,7 +2167,7 @@ void radv_CmdDispatch(
 
        radv_flush_compute_state(cmd_buffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10);
 
        struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
                                                             MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
@@ -2157,7 +2204,7 @@ void radv_CmdDispatchIndirect(
 
        radv_flush_compute_state(cmd_buffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25);
        struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
                                                             MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
        if (loc->sgpr_idx != -1) {
@@ -2172,16 +2219,24 @@ void radv_CmdDispatchIndirect(
                }
        }
 
-       radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) |
-                                   PKT3_SHADER_TYPE_S(1));
-       radeon_emit(cmd_buffer->cs, 1);
-       radeon_emit(cmd_buffer->cs, va);
-       radeon_emit(cmd_buffer->cs, va >> 32);
+       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
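+               /* The MEC variant of DISPATCH_INDIRECT takes the 64-bit
+                * buffer address directly. */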
+               radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+               radeon_emit(cmd_buffer->cs, va);
+               radeon_emit(cmd_buffer->cs, va >> 32);
+               radeon_emit(cmd_buffer->cs, 1);
+       } else {
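+               /* The graphics ME variant only takes an offset, so program
+                * the base address with SET_BASE first. */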
+               radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+               radeon_emit(cmd_buffer->cs, 1);
+               radeon_emit(cmd_buffer->cs, va);
+               radeon_emit(cmd_buffer->cs, va >> 32);
 
-       radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
-                                   PKT3_SHADER_TYPE_S(1));
-       radeon_emit(cmd_buffer->cs, 0);
-       radeon_emit(cmd_buffer->cs, 1);
+               radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+               radeon_emit(cmd_buffer->cs, 0);
+               radeon_emit(cmd_buffer->cs, 1);
+       }
 
        assert(cmd_buffer->cs->cdw <= cdw_max);
 }
@@ -2207,7 +2262,7 @@ void radv_unaligned_dispatch(
 
        radv_flush_compute_state(cmd_buffer);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
 
        radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
        radeon_emit(cmd_buffer->cs,
@@ -2333,6 +2388,8 @@ static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffe
                                               struct radv_image *image,
                                               VkImageLayout src_layout,
                                               VkImageLayout dst_layout,
+                                              unsigned src_queue_mask,
+                                              unsigned dst_queue_mask,
                                               VkImageSubresourceRange range,
                                               VkImageAspectFlags pending_clears)
 {
@@ -2341,8 +2398,8 @@ static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffe
                        radv_initialise_cmask(cmd_buffer, image, 0xccccccccu);
                else
                        radv_initialise_cmask(cmd_buffer, image, 0xffffffffu);
-       } else if (radv_layout_has_cmask(image, src_layout) &&
-                  !radv_layout_has_cmask(image, dst_layout)) {
+       } else if (radv_layout_has_cmask(image, src_layout, src_queue_mask) &&
+                  !radv_layout_has_cmask(image, dst_layout, dst_queue_mask)) {
                radv_fast_clear_flush_image_inplace(cmd_buffer, image);
        }
 }
@@ -2383,16 +2440,40 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image,
                                         VkImageLayout src_layout,
                                         VkImageLayout dst_layout,
+                                        int src_family,
+                                        int dst_family,
                                         VkImageSubresourceRange range,
                                         VkImageAspectFlags pending_clears)
 {
+       if (image->exclusive && src_family != dst_family) {
+               /* This is an acquire or a release operation and there will be
+                * a corresponding release/acquire. Do the transition in the
+                * most flexible queue. */
+
+               assert(src_family == cmd_buffer->queue_family_index ||
+                      dst_family == cmd_buffer->queue_family_index);
+
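+               /* Skip the transition here when the other queue in the pair
+                * is more capable and will perform it on its side. */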
+               if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
+                       return;
+
+               if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
+                   (src_family == RADV_QUEUE_GENERAL ||
+                    dst_family == RADV_QUEUE_GENERAL))
+                       return;
+       }
+
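+       /* Whether compressed metadata (e.g. CMASK) stays usable in a given
+        * layout depends on which queue families may access the image, so
+        * compute the per-family masks for the helpers below. */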
+       unsigned src_queue_mask = radv_image_queue_family_mask(image, src_family);
+       unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_family);
+
        if (image->htile.size)
                radv_handle_depth_image_transition(cmd_buffer, image, src_layout,
                                                   dst_layout, range, pending_clears);
 
        if (image->cmask.size)
                radv_handle_cmask_image_transition(cmd_buffer, image, src_layout,
-                                                  dst_layout, range, pending_clears);
+                                                  dst_layout, src_queue_mask,
+                                                  dst_queue_mask, range,
+                                                  pending_clears);
 
        if (image->surface.dcc_size)
                radv_handle_dcc_image_transition(cmd_buffer, image, src_layout,
@@ -2456,6 +2537,8 @@ void radv_CmdPipelineBarrier(
                radv_handle_image_transition(cmd_buffer, image,
                                             pImageMemoryBarriers[i].oldLayout,
                                             pImageMemoryBarriers[i].newLayout,
+                                            pImageMemoryBarriers[i].srcQueueFamilyIndex,
+                                            pImageMemoryBarriers[i].dstQueueFamilyIndex,
                                             pImageMemoryBarriers[i].subresourceRange,
                                             0);
        }
@@ -2467,9 +2550,11 @@ void radv_CmdPipelineBarrier(
                case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
                case VK_ACCESS_INDEX_READ_BIT:
                case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
-               case VK_ACCESS_UNIFORM_READ_BIT:
                        flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1;
                        break;
+               case VK_ACCESS_UNIFORM_READ_BIT:
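+                       /* Uniform reads may go through either the scalar
+                        * (SMEM) or the vector (VMEM) L1 cache, so invalidate
+                        * both. */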
+                       flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;
+                       break;
                case VK_ACCESS_SHADER_READ_BIT:
                        flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2;
                        break;
@@ -2499,7 +2584,7 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
 
        cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8);
 
-       unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
 
        /* TODO: this is overkill. Probably should figure something out from
         * the stage mask. */
@@ -2566,7 +2651,7 @@ void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
 
                cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8);
 
-               unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
+               MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
 
                radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
                radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
@@ -2586,6 +2671,8 @@ void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
                radv_handle_image_transition(cmd_buffer, image,
                                             pImageMemoryBarriers[i].oldLayout,
                                             pImageMemoryBarriers[i].newLayout,
+                                            pImageMemoryBarriers[i].srcQueueFamilyIndex,
+                                            pImageMemoryBarriers[i].dstQueueFamilyIndex,
                                             pImageMemoryBarriers[i].subresourceRange,
                                             0);
        }