radv: set correct INDEX_TYPE for indexed indirect draws on GFX9
[mesa.git] / src / amd / vulkan / radv_cmd_buffer.c
index 5f22733a7d9e5cf7ceb4ee5649844ac82eba51c6..39dfffc3762479abcada72da13e7a6b3c5d893bc 100644 (file)
@@ -78,19 +78,23 @@ const struct radv_dynamic_state default_dynamic_state = {
        },
 };
 
-void
+static void
 radv_dynamic_state_copy(struct radv_dynamic_state *dest,
                        const struct radv_dynamic_state *src,
                        uint32_t copy_mask)
 {
+       /* Make sure to copy the number of viewports/scissors because they can
+        * only be specified at pipeline creation time.
+        */
+       dest->viewport.count = src->viewport.count;
+       dest->scissor.count = src->scissor.count;
+
        if (copy_mask & (1 << VK_DYNAMIC_STATE_VIEWPORT)) {
-               dest->viewport.count = src->viewport.count;
                typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
                             src->viewport.count);
        }
 
        if (copy_mask & (1 << VK_DYNAMIC_STATE_SCISSOR)) {
-               dest->scissor.count = src->scissor.count;
                typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
                             src->scissor.count);
        }
@@ -353,7 +357,7 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
        if (!device->trace_bo)
                return;
 
-       va = device->ws->buffer_get_va(device->trace_bo);
+       va = radv_buffer_get_va(device->trace_bo);
        if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
                va += 4;
 
@@ -369,7 +373,7 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
 static void
 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer)
 {
-       if (cmd_buffer->device->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
+       if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
                enum radv_cmd_flush_bits flags;
 
                /* Force wait for graphics/compute engines to be idle. */
@@ -398,7 +402,7 @@ radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
        if (!device->trace_bo)
                return;
 
-       va = device->ws->buffer_get_va(device->trace_bo);
+       va = radv_buffer_get_va(device->trace_bo);
 
        switch (ring) {
        case RING_GFX:
@@ -432,7 +436,7 @@ radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer)
        if (!device->trace_bo)
                return;
 
-       va = device->ws->buffer_get_va(device->trace_bo) + 24;
+       va = radv_buffer_get_va(device->trace_bo) + 24;
 
        MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws,
                                                           cmd_buffer->cs, 4 + MAX_SETS * 2);
@@ -484,13 +488,6 @@ radv_emit_graphics_depth_stencil_state(struct radv_cmd_buffer *cmd_buffer,
        radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2);
 }
 
-/* 12.4 fixed-point */
-static unsigned radv_pack_float_12p4(float x)
-{
-       return x <= 0    ? 0 :
-              x >= 4096 ? 0xffff : x * 16;
-}
-
 struct ac_userdata_info *
 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
                      gl_shader_stage stage,
@@ -528,13 +525,13 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
        radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[0]);
        radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[1]);
 
-       radeon_set_context_reg(cmd_buffer->cs, CM_R_028804_DB_EQAA, ms->db_eqaa);
-       radeon_set_context_reg(cmd_buffer->cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
+       radeon_set_context_reg(cmd_buffer->cs, R_028804_DB_EQAA, ms->db_eqaa);
+       radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
 
        if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
                return;
 
-       radeon_set_context_reg_seq(cmd_buffer->cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
+       radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
        radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
        radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);
 
@@ -584,19 +581,10 @@ radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer,
 
        radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL,
                               raster->pa_cl_clip_cntl);
-
        radeon_set_context_reg(cmd_buffer->cs, R_0286D4_SPI_INTERP_CONTROL_0,
                               raster->spi_interp_control);
-
-       radeon_set_context_reg_seq(cmd_buffer->cs, R_028A00_PA_SU_POINT_SIZE, 2);
-       unsigned tmp = (unsigned)(1.0 * 8.0);
-       radeon_emit(cmd_buffer->cs, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
-       radeon_emit(cmd_buffer->cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
-                   S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2))); /* R_028A04_PA_SU_POINT_MINMAX */
-
        radeon_set_context_reg(cmd_buffer->cs, R_028BE4_PA_SU_VTX_CNTL,
                               raster->pa_su_vtx_cntl);
-
        radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL,
                               raster->pa_su_sc_mode_cntl);
 }
@@ -616,7 +604,7 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer,
                struct ac_vs_output_info *outinfo)
 {
        struct radeon_winsys *ws = cmd_buffer->device->ws;
-       uint64_t va = ws->buffer_get_va(shader->bo) + shader->bo_offset;
+       uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
        unsigned export_count;
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
@@ -666,7 +654,7 @@ radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer,
                struct ac_es_output_info *outinfo)
 {
        struct radeon_winsys *ws = cmd_buffer->device->ws;
-       uint64_t va = ws->buffer_get_va(shader->bo) + shader->bo_offset;
+       uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
        radv_emit_prefetch(cmd_buffer, va, shader->code_size);
@@ -685,7 +673,7 @@ radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer,
                struct radv_shader_variant *shader)
 {
        struct radeon_winsys *ws = cmd_buffer->device->ws;
-       uint64_t va = ws->buffer_get_va(shader->bo) + shader->bo_offset;
+       uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
        uint32_t rsrc2 = shader->rsrc2;
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
@@ -710,7 +698,7 @@ radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer,
                struct radv_shader_variant *shader)
 {
        struct radeon_winsys *ws = cmd_buffer->device->ws;
-       uint64_t va = ws->buffer_get_va(shader->bo) + shader->bo_offset;
+       uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
 
        ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8);
        radv_emit_prefetch(cmd_buffer, va, shader->code_size);
@@ -845,7 +833,7 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer,
                               S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
                               S_028B90_ENABLE(gs_num_invocations > 0));
 
-       va = ws->buffer_get_va(gs->bo) + gs->bo_offset;
+       va = radv_buffer_get_va(gs->bo) + gs->bo_offset;
        ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8);
        radv_emit_prefetch(cmd_buffer, va, gs->code_size);
 
@@ -886,7 +874,7 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
        assert (pipeline->shaders[MESA_SHADER_FRAGMENT]);
 
        ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
-       va = ws->buffer_get_va(ps->bo) + ps->bo_offset;
+       va = radv_buffer_get_va(ps->bo) + ps->bo_offset;
        ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
        radv_emit_prefetch(cmd_buffer, va, ps->code_size);
 
@@ -935,19 +923,17 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
        }
 }
 
-static void polaris_set_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer,
-                                        struct radv_pipeline *pipeline)
+static void
+radv_emit_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer,
+                          struct radv_pipeline *pipeline)
 {
-       uint32_t vtx_reuse_depth = 30;
+       struct radeon_winsys_cs *cs = cmd_buffer->cs;
+
        if (cmd_buffer->device->physical_device->rad_info.family < CHIP_POLARIS10)
                return;
 
-       if (pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
-               if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD)
-                       vtx_reuse_depth = 14;
-       }
-       radeon_set_context_reg(cmd_buffer->cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
-                              vtx_reuse_depth);
+       radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+                              pipeline->graphics.vtx_reuse_depth);
 }
 
 static void
@@ -966,7 +952,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
        radv_emit_tess_shaders(cmd_buffer, pipeline);
        radv_emit_geometry_shader(cmd_buffer, pipeline);
        radv_emit_fragment_shader(cmd_buffer, pipeline);
-       polaris_set_vgt_vertex_reuse(cmd_buffer, pipeline);
+       radv_emit_vgt_vertex_reuse(cmd_buffer, pipeline);
 
        cmd_buffer->scratch_size_needed =
                                  MAX2(cmd_buffer->scratch_size_needed,
@@ -1006,6 +992,11 @@ static void
 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
 {
        uint32_t count = cmd_buffer->state.dynamic.scissor.count;
+
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+               cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
+               si_emit_cache_flush(cmd_buffer);
+       }
        si_write_scissors(cmd_buffer->cs, 0, count,
                          cmd_buffer->state.dynamic.scissor.scissors,
                          cmd_buffer->state.dynamic.viewport.viewports,
@@ -1196,7 +1187,7 @@ radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
                          VkClearDepthStencilValue ds_clear_value,
                          VkImageAspectFlags aspects)
 {
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo);
+       uint64_t va = radv_buffer_get_va(image->bo);
        va += image->offset + image->clear_value_offset;
        unsigned reg_offset = 0, reg_count = 0;
 
@@ -1236,7 +1227,7 @@ static void
 radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
                           struct radv_image *image)
 {
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo);
+       uint64_t va = radv_buffer_get_va(image->bo);
        va += image->offset + image->clear_value_offset;
 
        if (!image->surface.htile_size)
@@ -1268,7 +1259,7 @@ radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
                                  bool value)
 {
        uint64_t pred_val = value;
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo);
+       uint64_t va = radv_buffer_get_va(image->bo);
        va += image->offset + image->dcc_pred_offset;
 
        if (!image->surface.dcc_size)
@@ -1292,7 +1283,7 @@ radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
                          int idx,
                          uint32_t color_values[2])
 {
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo);
+       uint64_t va = radv_buffer_get_va(image->bo);
        va += image->offset + image->clear_value_offset;
 
        if (!image->cmask.size && !image->surface.dcc_size)
@@ -1319,7 +1310,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
                           struct radv_image *image,
                           int idx)
 {
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo);
+       uint64_t va = radv_buffer_get_va(image->bo);
        va += image->offset + image->clear_value_offset;
 
        if (!image->cmask.size && !image->surface.dcc_size)
@@ -1522,7 +1513,7 @@ radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer)
                                         &bo_offset))
                return;
 
-       set->va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
+       set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
        set->va += bo_offset;
 }
 
@@ -1547,7 +1538,7 @@ radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer)
                uptr[1] = set_va >> 32;
        }
 
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
+       uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
        va += offset;
 
        if (cmd_buffer->state.pipeline) {
@@ -1598,9 +1589,7 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
                                                           cmd_buffer->cs,
                                                           MAX_SETS * MESA_SHADER_STAGES * 4);
 
-       for (i = 0; i < MAX_SETS; i++) {
-               if (!(cmd_buffer->state.descriptors_dirty & (1u << i)))
-                       continue;
+       for_each_bit(i, cmd_buffer->state.descriptors_dirty) {
                struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i];
                if (!set)
                        continue;
@@ -1638,7 +1627,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
        memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
               16 * layout->dynamic_offset_count);
 
-       va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
+       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
        va += offset;
 
        MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
@@ -1710,7 +1699,7 @@ radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
                        uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb];
 
                        device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8);
-                       va = device->ws->buffer_get_va(buffer->bo);
+                       va = radv_buffer_get_va(buffer->bo);
 
                        offset = cmd_buffer->state.vertex_bindings[vb].offset + velems->offset[i];
                        va += offset + buffer->offset;
@@ -1723,7 +1712,7 @@ radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
                        desc[3] = velems->rsrc_word3[i];
                }
 
-               va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
+               va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
                va += vb_offset;
 
                radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
@@ -2075,7 +2064,7 @@ static void emit_gfx_buffer_state(struct radv_cmd_buffer *cmd_buffer)
 {
        struct radv_device *device = cmd_buffer->device;
        if (device->gfx_init) {
-               uint64_t va = device->ws->buffer_get_va(device->gfx_init);
+               uint64_t va = radv_buffer_get_va(device->gfx_init);
                device->ws->cs_add_buffer(cmd_buffer->cs, device->gfx_init, 8);
                radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
                radeon_emit(cmd_buffer->cs, va);
@@ -2117,6 +2106,7 @@ VkResult radv_BeginCommandBuffer(
        }
 
        if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+               assert(pBeginInfo->pInheritanceInfo);
                cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
                cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
 
@@ -2166,7 +2156,7 @@ void radv_CmdBindIndexBuffer(
        RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
 
        cmd_buffer->state.index_type = indexType; /* vk matches hw */
-       cmd_buffer->state.index_va = cmd_buffer->device->ws->buffer_get_va(index_buffer->bo);
+       cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
        cmd_buffer->state.index_va += index_buffer->offset + offset;
 
        int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
@@ -2288,7 +2278,7 @@ void radv_meta_push_descriptor_set(
                                          (void**) &push_set->mapped_ptr))
                return;
 
-       push_set->va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
+       push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
        push_set->va += bo_offset;
 
        radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
@@ -2392,7 +2382,7 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
        cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
        compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-       va = ws->buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
+       va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
 
        ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8);
        radv_emit_prefetch(cmd_buffer, va, compute_shader->code_size);
@@ -2446,14 +2436,20 @@ void radv_CmdBindPipeline(
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
        RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
 
-       radv_mark_descriptor_sets_dirty(cmd_buffer);
-
        switch (pipelineBindPoint) {
        case VK_PIPELINE_BIND_POINT_COMPUTE:
+               if (cmd_buffer->state.compute_pipeline == pipeline)
+                       return;
+               radv_mark_descriptor_sets_dirty(cmd_buffer);
+
                cmd_buffer->state.compute_pipeline = pipeline;
                cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
                break;
        case VK_PIPELINE_BIND_POINT_GRAPHICS:
+               if (cmd_buffer->state.pipeline == pipeline)
+                       return;
+               radv_mark_descriptor_sets_dirty(cmd_buffer);
+
                cmd_buffer->state.pipeline = pipeline;
                if (!pipeline)
                        break;
@@ -2497,10 +2493,10 @@ void radv_CmdSetViewport(
        const VkViewport*                           pViewports)
 {
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-
        const uint32_t total_count = firstViewport + viewportCount;
-       if (cmd_buffer->state.dynamic.viewport.count < total_count)
-               cmd_buffer->state.dynamic.viewport.count = total_count;
+
+       assert(firstViewport < MAX_VIEWPORTS);
+       assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
 
        memcpy(cmd_buffer->state.dynamic.viewport.viewports + firstViewport,
               pViewports, viewportCount * sizeof(*pViewports));
@@ -2515,10 +2511,10 @@ void radv_CmdSetScissor(
        const VkRect2D*                             pScissors)
 {
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-
        const uint32_t total_count = firstScissor + scissorCount;
-       if (cmd_buffer->state.dynamic.scissor.count < total_count)
-               cmd_buffer->state.dynamic.scissor.count = total_count;
+
+       assert(firstScissor < MAX_SCISSORS);
+       assert(total_count >= 1 && total_count <= MAX_SCISSORS);
 
        memcpy(cmd_buffer->state.dynamic.scissor.scissors + firstScissor,
               pScissors, scissorCount * sizeof(*pScissors));
@@ -2653,16 +2649,17 @@ void radv_CmdExecuteCommands(
                                assert(secondary->ring_offsets_idx == primary->ring_offsets_idx);
                }
                primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
+
+               primary->state.emitted_pipeline = secondary->state.emitted_pipeline;
+               primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
+               primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
+               primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
        }
 
-       /* if we execute secondary we need to re-emit out pipelines */
+       /* if we execute secondary we need to mark some stuff to reset dirty */
        if (commandBufferCount) {
-               primary->state.emitted_pipeline = NULL;
-               primary->state.emitted_compute_pipeline = NULL;
                primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
                primary->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL;
-               primary->state.last_primitive_reset_en = -1;
-               primary->state.last_primitive_reset_index = 0;
                radv_mark_descriptor_sets_dirty(primary);
        }
 }
@@ -2988,13 +2985,15 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer,
        RADV_FROM_HANDLE(radv_buffer, count_buffer, _count_buffer);
        struct radeon_winsys_cs *cs = cmd_buffer->cs;
 
-       uint64_t indirect_va = cmd_buffer->device->ws->buffer_get_va(buffer->bo);
+       uint64_t indirect_va = radv_buffer_get_va(buffer->bo);
        indirect_va += offset + buffer->offset;
        uint64_t count_va = 0;
 
        if (count_buffer) {
-               count_va = cmd_buffer->device->ws->buffer_get_va(count_buffer->bo);
+               count_va = radv_buffer_get_va(count_buffer->bo);
                count_va += count_offset + count_buffer->offset;
+
+               cmd_buffer->device->ws->cs_add_buffer(cs, count_buffer->bo, 8);
        }
 
        if (!draw_count)
@@ -3059,8 +3058,14 @@ radv_cmd_draw_indexed_indirect_count(
 
        MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 31 * MAX_VIEWS);
 
-       radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
-       radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+               radeon_set_uconfig_reg_idx(cmd_buffer->cs,
+                                          R_03090C_VGT_INDEX_TYPE,
+                                          2, cmd_buffer->state.index_type);
+       } else {
+               radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+               radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
+       }
 
        radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_BASE, 1, 0));
        radeon_emit(cmd_buffer->cs, index_va);
@@ -3125,14 +3130,163 @@ void radv_CmdDrawIndexedIndirectCountAMD(
                                             maxDrawCount, stride);
 }
 
+struct radv_dispatch_info {
+       /**
+        * Layout of the grid (in block units) for a direct dispatch: x, y, z. Not read when 'indirect' is set.
+        */
+       uint32_t blocks[3];
+
+       /**
+        * Whether it's an unaligned compute dispatch (only consulted for direct dispatches).
+        */
+       bool unaligned;
+
+       /**
+        * Indirect compute parameters resource; NULL selects a direct dispatch using 'blocks'.
+        */
+       struct radv_buffer *indirect;
+       uint64_t indirect_offset; /* added to indirect->offset to locate the parameters */
+};
+/* Emit the command-stream packets for one compute dispatch, direct or indirect, as described by info. */
+static void
+radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
+                          const struct radv_dispatch_info *info)
+{
+       struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+       struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
+       struct radeon_winsys *ws = cmd_buffer->device->ws;
+       struct radeon_winsys_cs *cs = cmd_buffer->cs;
+       struct ac_userdata_info *loc;
+       uint8_t grid_used;
+
+       grid_used = compute_shader->info.info.cs.grid_components_used; /* number of grid-size components written below */
+
+       loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
+                                   AC_UD_CS_GRID_SIZE);
+
+       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25); /* space for the packets below; checked by the final assert */
+
+       if (info->indirect) { /* indirect: dispatch parameters live in a buffer */
+               uint64_t va = radv_buffer_get_va(info->indirect->bo);
+
+               va += info->indirect->offset + info->indirect_offset;
+
+               ws->cs_add_buffer(cs, info->indirect->bo, 8);
+
+               if (loc->sgpr_idx != -1) { /* NOTE(review): -1 appears to mean "no grid-size user SGPR" - confirm */
+                       for (unsigned i = 0; i < grid_used; ++i) { /* one COPY_DATA per component: memory dword i -> user-data register i */
+                               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                               radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+                                               COPY_DATA_DST_SEL(COPY_DATA_REG));
+                               radeon_emit(cs, (va +  4 * i));
+                               radeon_emit(cs, (va + 4 * i) >> 32);
+                               radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
+                                                + loc->sgpr_idx * 4) >> 2) + i);
+                               radeon_emit(cs, 0);
+                       }
+               }
+
+               if (radv_cmd_buffer_uses_mec(cmd_buffer)) { /* here the address is carried by DISPATCH_INDIRECT itself */
+                       radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+                       radeon_emit(cs, 1);
+               } else { /* here the address is supplied through a preceding SET_BASE packet */
+                       radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cs, 1);
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+
+                       radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
+                                       PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, 1);
+               }
+       } else { /* direct: grid size comes from info->blocks */
+               unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
+               unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) |
+                                             S_00B800_FORCE_START_AT_000(1);
+
+               if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
+                       /* If the KMD allows it (there is a KMD hw register for
+                        * it), allow launching waves out-of-order.
+                        */
+                       dispatch_initiator |= S_00B800_ORDER_MODE(1);
+               }
+
+               if (info->unaligned) { /* program full/partial thread counts and enable partial thread groups */
+                       unsigned *cs_block_size = compute_shader->info.cs.block_size;
+                       unsigned remainder[3];
+
+                       /* If aligned, these should be an entire block size,
+                        * not 0.
+                        */
+                       remainder[0] = blocks[0] + cs_block_size[0] -
+                                      align_u32_npot(blocks[0], cs_block_size[0]);
+                       remainder[1] = blocks[1] + cs_block_size[1] -
+                                      align_u32_npot(blocks[1], cs_block_size[1]);
+                       remainder[2] = blocks[2] + cs_block_size[2] -
+                                      align_u32_npot(blocks[2], cs_block_size[2]);
+
+                       blocks[0] = round_up_u32(blocks[0], cs_block_size[0]); /* NOTE(review): presumably a rounding-up division by the block size - confirm */
+                       blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
+                       blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
+
+                       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+                       radeon_emit(cs,
+                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
+                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
+                       radeon_emit(cs,
+                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
+                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
+                       radeon_emit(cs,
+                                   S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
+                                   S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
+
+                       dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+               }
+
+               if (loc->sgpr_idx != -1) { /* write the grid size into the compute user-data registers */
+                       assert(!loc->indirect);
+                       assert(loc->num_sgprs == grid_used);
+
+                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+                                                 loc->sgpr_idx * 4, grid_used);
+                       radeon_emit(cs, blocks[0]);
+                       if (grid_used > 1)
+                               radeon_emit(cs, blocks[1]);
+                       if (grid_used > 2)
+                               radeon_emit(cs, blocks[2]);
+               }
+
+               radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
+                               PKT3_SHADER_TYPE_S(1));
+               radeon_emit(cs, blocks[0]);
+               radeon_emit(cs, blocks[1]);
+               radeon_emit(cs, blocks[2]);
+               radeon_emit(cs, dispatch_initiator);
+       }
+
+       assert(cmd_buffer->cs->cdw <= cdw_max);
+}
+
 static void
-radv_flush_compute_state(struct radv_cmd_buffer *cmd_buffer)
+radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
+             const struct radv_dispatch_info *info)
 {
        radv_emit_compute_pipeline(cmd_buffer);
+
        radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
        radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
                             VK_SHADER_STAGE_COMPUTE_BIT);
+
        si_emit_cache_flush(cmd_buffer);
+
+       radv_emit_dispatch_packets(cmd_buffer, info);
+
+       radv_cmd_buffer_after_draw(cmd_buffer);
 }
 
 void radv_CmdDispatch(
@@ -3142,34 +3296,13 @@ void radv_CmdDispatch(
        uint32_t                                    z)
 {
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+       struct radv_dispatch_info info = {};
 
-       radv_flush_compute_state(cmd_buffer);
-
-       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10);
-
-       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
-                                                            MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
-       if (loc->sgpr_idx != -1) {
-               assert(!loc->indirect);
-               uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
-               assert(loc->num_sgprs == grid_used);
-               radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, grid_used);
-               radeon_emit(cmd_buffer->cs, x);
-               if (grid_used > 1)
-                       radeon_emit(cmd_buffer->cs, y);
-               if (grid_used > 2)
-                       radeon_emit(cmd_buffer->cs, z);
-       }
-
-       radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
-                   PKT3_SHADER_TYPE_S(1));
-       radeon_emit(cmd_buffer->cs, x);
-       radeon_emit(cmd_buffer->cs, y);
-       radeon_emit(cmd_buffer->cs, z);
-       radeon_emit(cmd_buffer->cs, 1);
+       info.blocks[0] = x;
+       info.blocks[1] = y;
+       info.blocks[2] = z;
 
-       assert(cmd_buffer->cs->cdw <= cdw_max);
-       radv_cmd_buffer_after_draw(cmd_buffer);
+       radv_dispatch(cmd_buffer, &info);
 }
 
 void radv_CmdDispatchIndirect(
@@ -3179,50 +3312,12 @@ void radv_CmdDispatchIndirect(
 {
        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
        RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(buffer->bo);
-       va += buffer->offset + offset;
-
-       cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8);
-
-       radv_flush_compute_state(cmd_buffer);
-
-       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25);
-       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
-                                                            MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
-       if (loc->sgpr_idx != -1) {
-               uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
-               for (unsigned i = 0; i < grid_used; ++i) {
-                       radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
-                       radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-                                   COPY_DATA_DST_SEL(COPY_DATA_REG));
-                       radeon_emit(cmd_buffer->cs, (va +  4 * i));
-                       radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32);
-                       radeon_emit(cmd_buffer->cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
-                       radeon_emit(cmd_buffer->cs, 0);
-               }
-       }
-
-       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
-               radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
-                                       PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cmd_buffer->cs, va);
-               radeon_emit(cmd_buffer->cs, va >> 32);
-               radeon_emit(cmd_buffer->cs, 1);
-       } else {
-               radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) |
-                                       PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cmd_buffer->cs, 1);
-               radeon_emit(cmd_buffer->cs, va);
-               radeon_emit(cmd_buffer->cs, va >> 32);
+       struct radv_dispatch_info info = {};
 
-               radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
-                                       PKT3_SHADER_TYPE_S(1));
-               radeon_emit(cmd_buffer->cs, 0);
-               radeon_emit(cmd_buffer->cs, 1);
-       }
+       info.indirect = buffer;
+       info.indirect_offset = offset;
 
-       assert(cmd_buffer->cs->cdw <= cdw_max);
-       radv_cmd_buffer_after_draw(cmd_buffer);
+       radv_dispatch(cmd_buffer, &info);
 }
 
 void radv_unaligned_dispatch(
@@ -3231,55 +3326,14 @@ void radv_unaligned_dispatch(
        uint32_t                                    y,
        uint32_t                                    z)
 {
-       struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-       struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-       uint32_t blocks[3], remainder[3];
+       struct radv_dispatch_info info = {};
 
-       blocks[0] = round_up_u32(x, compute_shader->info.cs.block_size[0]);
-       blocks[1] = round_up_u32(y, compute_shader->info.cs.block_size[1]);
-       blocks[2] = round_up_u32(z, compute_shader->info.cs.block_size[2]);
+       info.blocks[0] = x;
+       info.blocks[1] = y;
+       info.blocks[2] = z;
+       info.unaligned = 1;
 
-       /* If aligned, these should be an entire block size, not 0 */
-       remainder[0] = x + compute_shader->info.cs.block_size[0] - align_u32_npot(x, compute_shader->info.cs.block_size[0]);
-       remainder[1] = y + compute_shader->info.cs.block_size[1] - align_u32_npot(y, compute_shader->info.cs.block_size[1]);
-       remainder[2] = z + compute_shader->info.cs.block_size[2] - align_u32_npot(z, compute_shader->info.cs.block_size[2]);
-
-       radv_flush_compute_state(cmd_buffer);
-
-       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-       radeon_emit(cmd_buffer->cs,
-                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]) |
-                   S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
-       radeon_emit(cmd_buffer->cs,
-                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]) |
-                   S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
-       radeon_emit(cmd_buffer->cs,
-                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]) |
-                   S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
-
-       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
-                                                            MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
-       if (loc->sgpr_idx != -1) {
-               uint8_t grid_used = cmd_buffer->state.compute_pipeline->shaders[MESA_SHADER_COMPUTE]->info.info.cs.grid_components_used;
-               radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, grid_used);
-               radeon_emit(cmd_buffer->cs, blocks[0]);
-               if (grid_used > 1)
-                       radeon_emit(cmd_buffer->cs, blocks[1]);
-               if (grid_used > 2)
-                       radeon_emit(cmd_buffer->cs, blocks[2]);
-       }
-       radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
-                   PKT3_SHADER_TYPE_S(1));
-       radeon_emit(cmd_buffer->cs, blocks[0]);
-       radeon_emit(cmd_buffer->cs, blocks[1]);
-       radeon_emit(cmd_buffer->cs, blocks[2]);
-       radeon_emit(cmd_buffer->cs, S_00B800_COMPUTE_SHADER_EN(1) |
-                                   S_00B800_PARTIAL_TG_EN(1));
-
-       assert(cmd_buffer->cs->cdw <= cdw_max);
-       radv_cmd_buffer_after_draw(cmd_buffer);
+       radv_dispatch(cmd_buffer, &info);
 }
 
 void radv_CmdEndRenderPass(
@@ -3395,8 +3449,7 @@ static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffe
                                               VkImageLayout dst_layout,
                                               unsigned src_queue_mask,
                                               unsigned dst_queue_mask,
-                                              const VkImageSubresourceRange *range,
-                                              VkImageAspectFlags pending_clears)
+                                              const VkImageSubresourceRange *range)
 {
        if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
                if (image->fmask.size)
@@ -3432,8 +3485,7 @@ static void radv_handle_dcc_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                             VkImageLayout dst_layout,
                                             unsigned src_queue_mask,
                                             unsigned dst_queue_mask,
-                                            const VkImageSubresourceRange *range,
-                                            VkImageAspectFlags pending_clears)
+                                            const VkImageSubresourceRange *range)
 {
        if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
                radv_initialize_dcc(cmd_buffer, image, 0x20202020u);
@@ -3481,14 +3533,12 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
        if (image->cmask.size)
                radv_handle_cmask_image_transition(cmd_buffer, image, src_layout,
                                                   dst_layout, src_queue_mask,
-                                                  dst_queue_mask, range,
-                                                  pending_clears);
+                                                  dst_queue_mask, range);
 
        if (image->surface.dcc_size)
                radv_handle_dcc_image_transition(cmd_buffer, image, src_layout,
                                                 dst_layout, src_queue_mask,
-                                                dst_queue_mask, range,
-                                                pending_clears);
+                                                dst_queue_mask, range);
 }
 
 void radv_CmdPipelineBarrier(
@@ -3550,7 +3600,7 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
                        unsigned value)
 {
        struct radeon_winsys_cs *cs = cmd_buffer->cs;
-       uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo);
+       uint64_t va = radv_buffer_get_va(event->bo);
 
        cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8);
 
@@ -3563,7 +3613,7 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
                                   cmd_buffer->state.predicating,
                                   cmd_buffer->device->physical_device->rad_info.chip_class,
                                   false,
-                                  EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
+                                  V_028A90_BOTTOM_OF_PIPE_TS, 0,
                                   1, va, 2, value);
 
        assert(cmd_buffer->cs->cdw <= cdw_max);
@@ -3606,7 +3656,7 @@ void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
 
        for (unsigned i = 0; i < eventCount; ++i) {
                RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
-               uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo);
+               uint64_t va = radv_buffer_get_va(event->bo);
 
                cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8);