X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_cmd_buffer.c;h=992dc123a21a5380c117f364851b081c1bbc91e1;hb=d0d6a611d990c16136c2f27aeec192f37729fa0b;hp=8f2e98476e506b96842df05b394ee7060afb51ca;hpb=592069c1fbccf55e26d2822337dfab40edf6948e;p=mesa.git diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 8f2e98476e5..4db9d7628c2 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -27,9 +27,12 @@ #include "radv_private.h" #include "radv_radeon_winsys.h" +#include "radv_shader.h" #include "radv_cs.h" #include "sid.h" +#include "gfx9d.h" #include "vk_format.h" +#include "radv_debug.h" #include "radv_meta.h" #include "ac_debug.h" @@ -40,7 +43,7 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, VkImageLayout dst_layout, uint32_t src_family, uint32_t dst_family, - VkImageSubresourceRange range, + const VkImageSubresourceRange *range, VkImageAspectFlags pending_clears); const struct radv_dynamic_state default_dynamic_state = { @@ -80,14 +83,18 @@ radv_dynamic_state_copy(struct radv_dynamic_state *dest, const struct radv_dynamic_state *src, uint32_t copy_mask) { + /* Make sure to copy the number of viewports/scissors because they can + * only be specified at pipeline creation time. + */ + dest->viewport.count = src->viewport.count; + dest->scissor.count = src->scissor.count; + if (copy_mask & (1 << VK_DYNAMIC_STATE_VIEWPORT)) { - dest->viewport.count = src->viewport.count; typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count); } if (copy_mask & (1 << VK_DYNAMIC_STATE_SCISSOR)) { - dest->scissor.count = src->scissor.count; typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count); } @@ -140,7 +147,6 @@ static VkResult radv_create_cmd_buffer( VkCommandBuffer* pCommandBuffer) { struct radv_cmd_buffer *cmd_buffer; - VkResult result; unsigned ring; cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -169,8 +175,8 @@ static VkResult radv_create_cmd_buffer( cmd_buffer->cs = device->ws->cs_create(device->ws, ring); if (!cmd_buffer->cs) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; + vk_free(&cmd_buffer->pool->alloc, cmd_buffer); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer); @@ -180,11 +186,66 @@ static VkResult radv_create_cmd_buffer( list_inithead(&cmd_buffer->upload.list); return VK_SUCCESS; +} + +static void +radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer) +{ + list_del(&cmd_buffer->pool_link); + + list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, + &cmd_buffer->upload.list, list) { + cmd_buffer->device->ws->buffer_destroy(up->upload_bo); + list_del(&up->list); + free(up); + } -fail: + if (cmd_buffer->upload.upload_bo) + cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo); + cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs); + free(cmd_buffer->push_descriptors.set.mapped_ptr); vk_free(&cmd_buffer->pool->alloc, cmd_buffer); +} - return result; +static VkResult +radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) +{ + + cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); + + list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, + &cmd_buffer->upload.list, list) { + cmd_buffer->device->ws->buffer_destroy(up->upload_bo); + list_del(&up->list); + free(up); + } + + cmd_buffer->push_constant_stages = 0; + cmd_buffer->scratch_size_needed = 0; + cmd_buffer->compute_scratch_size_needed = 0; + cmd_buffer->esgs_ring_size_needed = 0; + cmd_buffer->gsvs_ring_size_needed = 0; + cmd_buffer->tess_rings_needed = false; + cmd_buffer->sample_positions_needed = false; + + if (cmd_buffer->upload.upload_bo) + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, + cmd_buffer->upload.upload_bo, 8); + cmd_buffer->upload.offset = 0; + + cmd_buffer->record_result = VK_SUCCESS; + + cmd_buffer->ring_offsets_idx = -1; + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + void *fence_ptr; + radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0, + &cmd_buffer->gfx9_fence_offset, + &fence_ptr); + cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo; + } + + return cmd_buffer->record_result; } static bool @@ -205,7 +266,7 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, RADEON_FLAG_CPU_ACCESS); if (!bo) { - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; return false; } @@ -214,7 +275,7 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, upload = malloc(sizeof(*upload)); if (!upload) { - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; device->ws->buffer_destroy(bo); return false; } @@ -229,7 +290,7 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo); if (!cmd_buffer->upload.map) { - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; return false; } @@ -274,6 +335,19 @@ radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, return true; } +static void +radv_emit_write_data_packet(struct radeon_winsys_cs *cs, uint64_t va, + unsigned count, const uint32_t *data) +{ + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_ME)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit_array(cs, data, count); +} + void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer) { struct radv_device *device = cmd_buffer->device; @@ -283,23 +357,103 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer) if (!device->trace_bo) return; - va = device->ws->buffer_get_va(device->trace_bo); + va = radv_buffer_get_va(device->trace_bo); + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) + va += 4; MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 7); ++cmd_buffer->state.trace_id; device->ws->cs_add_buffer(cs, device->trace_bo, 8); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(V_370_ME)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, cmd_buffer->state.trace_id); + radv_emit_write_data_packet(cs, va, 1, &cmd_buffer->state.trace_id); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id)); } +static void +radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer) +{ + if (cmd_buffer->device->debug_flags & RADV_DEBUG_SYNC_SHADERS) { + enum radv_cmd_flush_bits flags; + + /* Force wait for graphics/compute engines to be idle. */ + flags = RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH; + + si_cs_emit_cache_flush(cmd_buffer->cs, false, + cmd_buffer->device->physical_device->rad_info.chip_class, + NULL, 0, + radv_cmd_buffer_uses_mec(cmd_buffer), + flags); + } + + radv_cmd_buffer_trace_emit(cmd_buffer); +} + +static void +radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline, enum ring_type ring) +{ + struct radv_device *device = cmd_buffer->device; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint32_t data[2]; + uint64_t va; + + if (!device->trace_bo) + return; + + va = radv_buffer_get_va(device->trace_bo); + + switch (ring) { + case RING_GFX: + va += 8; + break; + case RING_COMPUTE: + va += 16; + break; + default: + assert(!"invalid ring type"); + } + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws, + cmd_buffer->cs, 6); + + data[0] = (uintptr_t)pipeline; + data[1] = (uintptr_t)pipeline >> 32; + + device->ws->cs_add_buffer(cs, device->trace_bo, 8); + radv_emit_write_data_packet(cs, va, 2, data); +} + +static void +radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_device *device = cmd_buffer->device; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint32_t data[MAX_SETS * 2] = {}; + uint64_t va; + + if (!device->trace_bo) + return; + + va = radv_buffer_get_va(device->trace_bo) + 24; + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws, + cmd_buffer->cs, 4 + MAX_SETS * 2); + + for (int i = 0; i < MAX_SETS; i++) { + struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i]; + if (!set) + continue; + + data[i * 2] = (uintptr_t)set; + data[i * 2 + 1] = (uintptr_t)set >> 32; + } + + device->ws->cs_add_buffer(cs, device->trace_bo, 8); + radv_emit_write_data_packet(cs, va, MAX_SETS * 2, data); +} + static void radv_emit_graphics_blend_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) @@ -309,6 +463,17 @@ radv_emit_graphics_blend_state(struct radv_cmd_buffer *cmd_buffer, 8); radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, pipeline->graphics.blend.cb_color_control); radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, pipeline->graphics.blend.db_alpha_to_mask); + + if (cmd_buffer->device->physical_device->has_rbplus) { + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, 8); + radeon_emit_array(cmd_buffer->cs, pipeline->graphics.blend.sx_mrt_blend_opt, 8); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3); + radeon_emit(cmd_buffer->cs, 0); /* R_028754_SX_PS_DOWNCONVERT */ + radeon_emit(cmd_buffer->cs, 0); /* R_028758_SX_BLEND_OPT_EPSILON */ + radeon_emit(cmd_buffer->cs, 0); /* R_02875C_SX_BLEND_OPT_CONTROL */ + } } static void @@ -330,24 +495,7 @@ static unsigned radv_pack_float_12p4(float x) x >= 4096 ? 0xffff : x * 16; } -static uint32_t -shader_stage_to_user_data_0(gl_shader_stage stage, bool has_gs) -{ - switch (stage) { - case MESA_SHADER_FRAGMENT: - return R_00B030_SPI_SHADER_USER_DATA_PS_0; - case MESA_SHADER_VERTEX: - return has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B130_SPI_SHADER_USER_DATA_VS_0; - case MESA_SHADER_GEOMETRY: - return R_00B230_SPI_SHADER_USER_DATA_GS_0; - case MESA_SHADER_COMPUTE: - return R_00B900_COMPUTE_USER_DATA_0; - default: - unreachable("unknown shader"); - } -} - -static struct ac_userdata_info * +struct ac_userdata_info * radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx) @@ -362,7 +510,7 @@ radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, int idx, uint64_t va) { struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx); - uint32_t base_reg = shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); if (loc->sgpr_idx == -1) return; assert(loc->num_sgprs == 2); @@ -396,37 +544,40 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples); - uint32_t samples_offset; - void *samples_ptr; - void *src; - radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, &samples_offset, - &samples_ptr); - switch (num_samples) { - case 1: - src = cmd_buffer->device->sample_locations_1x; - break; - case 2: - src = cmd_buffer->device->sample_locations_2x; - break; - case 4: - src = cmd_buffer->device->sample_locations_4x; - break; - case 8: - src = cmd_buffer->device->sample_locations_8x; - break; - case 16: - src = cmd_buffer->device->sample_locations_16x; - break; - default: - unreachable("unknown number of samples"); + /* GFX9: Flush DFSM when the AA mode changes. */ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } - memcpy(samples_ptr, src, num_samples * 4 * 2); - - uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); - va += samples_offset; + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) { + uint32_t offset; + struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET); + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + if (loc->sgpr_idx == -1) + return; + assert(loc->num_sgprs == 1); + assert(!loc->indirect); + switch (num_samples) { + default: + offset = 0; + break; + case 2: + offset = 1; + break; + case 4: + offset = 3; + break; + case 8: + offset = 7; + break; + case 16: + offset = 15; + break; + } - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, - AC_UD_PS_SAMPLE_POS, va); + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, offset); + cmd_buffer->sample_positions_needed = true; + } } static void @@ -454,30 +605,40 @@ radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer, raster->pa_su_sc_mode_cntl); } +static inline void +radv_emit_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va, + unsigned size) +{ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) + si_cp_dma_prefetch(cmd_buffer, va, size); +} + static void radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline, - struct radv_shader_variant *shader) + struct radv_shader_variant *shader, + struct ac_vs_output_info *outinfo) { struct radeon_winsys *ws = cmd_buffer->device->ws; - uint64_t va = ws->buffer_get_va(shader->bo); + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; unsigned export_count; ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); - export_count = MAX2(1, shader->info.vs.param_exports); + export_count = MAX2(1, outinfo->param_exports); radeon_set_context_reg(cmd_buffer->cs, R_0286C4_SPI_VS_OUT_CONFIG, S_0286C4_VS_EXPORT_COUNT(export_count - 1)); radeon_set_context_reg(cmd_buffer->cs, R_02870C_SPI_SHADER_POS_FORMAT, S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | - S_02870C_POS1_EXPORT_FORMAT(shader->info.vs.pos_exports > 1 ? + S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) | - S_02870C_POS2_EXPORT_FORMAT(shader->info.vs.pos_exports > 2 ? + S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE) | - S_02870C_POS3_EXPORT_FORMAT(shader->info.vs.pos_exports > 3 ? + S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE)); @@ -494,39 +655,28 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1)); - unsigned clip_dist_mask, cull_dist_mask, total_mask; - clip_dist_mask = shader->info.vs.clip_dist_mask; - cull_dist_mask = shader->info.vs.cull_dist_mask; - total_mask = clip_dist_mask | cull_dist_mask; radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_CL_VS_OUT_CNTL, - S_02881C_USE_VTX_POINT_SIZE(shader->info.vs.writes_pointsize) | - S_02881C_USE_VTX_RENDER_TARGET_INDX(shader->info.vs.writes_layer) | - S_02881C_USE_VTX_VIEWPORT_INDX(shader->info.vs.writes_viewport_index) | - S_02881C_VS_OUT_MISC_VEC_ENA(shader->info.vs.writes_pointsize || - shader->info.vs.writes_layer || - shader->info.vs.writes_viewport_index) | - S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) | - pipeline->graphics.raster.pa_cl_vs_out_cntl | - cull_dist_mask << 8 | - clip_dist_mask); - - radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF, - S_028AB4_REUSE_OFF(shader->info.vs.writes_viewport_index)); + pipeline->graphics.pa_cl_vs_out_cntl); + + if (cmd_buffer->device->physical_device->rad_info.chip_class <= VI) + radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF, + S_028AB4_REUSE_OFF(outinfo->writes_viewport_index)); } static void radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, - struct radv_shader_variant *shader) + struct radv_shader_variant *shader, + struct ac_es_output_info *outinfo) { struct radeon_winsys *ws = cmd_buffer->device->ws; - uint64_t va = ws->buffer_get_va(shader->bo); + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - shader->info.vs.esgs_itemsize / 4); + outinfo->esgs_itemsize / 4); radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4); radeon_emit(cmd_buffer->cs, va >> 8); radeon_emit(cmd_buffer->cs, va >> 40); @@ -534,6 +684,48 @@ radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, shader->rsrc2); } +static void +radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, + struct radv_shader_variant *shader) +{ + struct radeon_winsys *ws = cmd_buffer->device->ws; + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint32_t rsrc2 = shader->rsrc2; + + ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2); + radeon_emit(cmd_buffer->cs, va >> 8); + radeon_emit(cmd_buffer->cs, va >> 40); + + rsrc2 |= S_00B52C_LDS_SIZE(cmd_buffer->state.pipeline->graphics.tess.lds_size); + if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK && + cmd_buffer->device->physical_device->rad_info.family != CHIP_HAWAII) + radeon_set_sh_reg(cmd_buffer->cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(cmd_buffer->cs, shader->rsrc1); + radeon_emit(cmd_buffer->cs, rsrc2); +} + +static void +radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, + struct radv_shader_variant *shader) +{ + struct radeon_winsys *ws = cmd_buffer->device->ws; + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + + ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4); + radeon_emit(cmd_buffer->cs, va >> 8); + radeon_emit(cmd_buffer->cs, va >> 40); + radeon_emit(cmd_buffer->cs, shader->rsrc1); + radeon_emit(cmd_buffer->cs, shader->rsrc2); +} + static void radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) @@ -544,34 +736,80 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer, vs = pipeline->shaders[MESA_SHADER_VERTEX]; - if (vs->info.vs.as_es) - radv_emit_hw_es(cmd_buffer, vs); + if (vs->info.vs.as_ls) + radv_emit_hw_ls(cmd_buffer, vs); + else if (vs->info.vs.as_es) + radv_emit_hw_es(cmd_buffer, vs, &vs->info.vs.es_info); else - radv_emit_hw_vs(cmd_buffer, pipeline, vs); + radv_emit_hw_vs(cmd_buffer, pipeline, vs, &vs->info.vs.outinfo); - radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, 0); + radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, pipeline->graphics.vgt_primitiveid_en); } -static uint32_t si_vgt_gs_mode(struct radv_shader_variant *gs) + +static void +radv_emit_tess_shaders(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) { - unsigned gs_max_vert_out = gs->info.gs.vertices_out; - unsigned cut_mode; + if (!radv_pipeline_has_tess(pipeline)) + return; - if (gs_max_vert_out <= 128) { - cut_mode = V_028A40_GS_CUT_128; - } else if (gs_max_vert_out <= 256) { - cut_mode = V_028A40_GS_CUT_256; - } else if (gs_max_vert_out <= 512) { - cut_mode = V_028A40_GS_CUT_512; - } else { - assert(gs_max_vert_out <= 1024); - cut_mode = V_028A40_GS_CUT_1024; + struct radv_shader_variant *tes, *tcs; + + tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL]; + tes = pipeline->shaders[MESA_SHADER_TESS_EVAL]; + + if (tes->info.tes.as_es) + radv_emit_hw_es(cmd_buffer, tes, &tes->info.tes.es_info); + else + radv_emit_hw_vs(cmd_buffer, pipeline, tes, &tes->info.tes.outinfo); + + radv_emit_hw_hs(cmd_buffer, tcs); + + radeon_set_context_reg(cmd_buffer->cs, R_028B6C_VGT_TF_PARAM, + pipeline->graphics.tess.tf_param); + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) + radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, + pipeline->graphics.tess.ls_hs_config); + else + radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, + pipeline->graphics.tess.ls_hs_config); + + struct ac_userdata_info *loc; + + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_CTRL, AC_UD_TCS_OFFCHIP_LAYOUT); + if (loc->sgpr_idx != -1) { + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_TESS_CTRL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + assert(loc->num_sgprs == 4); + assert(!loc->indirect); + radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 4); + radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.offchip_layout); + radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.tcs_out_offsets); + radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.tcs_out_layout | + pipeline->graphics.tess.num_tcs_input_cp << 26); + radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.tcs_in_layout); + } + + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_EVAL, AC_UD_TES_OFFCHIP_LAYOUT); + if (loc->sgpr_idx != -1) { + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_TESS_EVAL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + assert(loc->num_sgprs == 1); + assert(!loc->indirect); + + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, + pipeline->graphics.tess.offchip_layout); } - return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | - S_028A40_CUT_MODE(cut_mode)| - S_028A40_ES_WRITE_OPTIMIZE(1) | - S_028A40_GS_WRITE_OPTIMIZE(1); + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_LS_TCS_IN_LAYOUT); + if (loc->sgpr_idx != -1) { + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + assert(loc->num_sgprs == 1); + assert(!loc->indirect); + + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, + pipeline->graphics.tess.tcs_in_layout); + } } static void @@ -582,13 +820,11 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *gs; uint64_t va; + radeon_set_context_reg(cmd_buffer->cs, R_028A40_VGT_GS_MODE, pipeline->graphics.vgt_gs_mode); + gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; - if (!gs) { - radeon_set_context_reg(cmd_buffer->cs, R_028A40_VGT_GS_MODE, 0); + if (!gs) return; - } - - radeon_set_context_reg(cmd_buffer->cs, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(gs)); uint32_t gsvs_itemsize = gs->info.gs.max_gsvs_emit_size >> 2; @@ -613,15 +849,17 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer, S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0)); - va = ws->buffer_get_va(gs->bo); + va = radv_buffer_get_va(gs->bo) + gs->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8); + radv_emit_prefetch(cmd_buffer, va, gs->code_size); + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4); radeon_emit(cmd_buffer->cs, va >> 8); radeon_emit(cmd_buffer->cs, va >> 40); radeon_emit(cmd_buffer->cs, gs->rsrc1); radeon_emit(cmd_buffer->cs, gs->rsrc2); - radv_emit_hw_vs(cmd_buffer, pipeline, pipeline->gs_copy_shader); + radv_emit_hw_vs(cmd_buffer, pipeline, pipeline->gs_copy_shader, &pipeline->gs_copy_shader->info.vs.outinfo); struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, AC_UD_GS_VS_RING_STRIDE_ENTRIES); @@ -645,18 +883,16 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) { struct radeon_winsys *ws = cmd_buffer->device->ws; - struct radv_shader_variant *ps, *vs; + struct radv_shader_variant *ps; uint64_t va; unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); struct radv_blend_state *blend = &pipeline->graphics.blend; - unsigned ps_offset = 0; - unsigned z_order; assert (pipeline->shaders[MESA_SHADER_FRAGMENT]); ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; - vs = radv_pipeline_has_gs(pipeline) ? pipeline->gs_copy_shader : pipeline->shaders[MESA_SHADER_VERTEX]; - va = ws->buffer_get_va(ps->bo); + va = radv_buffer_get_va(ps->bo) + ps->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8); + radv_emit_prefetch(cmd_buffer, va, ps->code_size); radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4); radeon_emit(cmd_buffer->cs, va >> 8); @@ -664,20 +900,8 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, ps->rsrc1); radeon_emit(cmd_buffer->cs, ps->rsrc2); - if (ps->info.fs.early_fragment_test || !ps->info.fs.writes_memory) - z_order = V_02880C_EARLY_Z_THEN_LATE_Z; - else - z_order = V_02880C_LATE_Z; - - radeon_set_context_reg(cmd_buffer->cs, R_02880C_DB_SHADER_CONTROL, - S_02880C_Z_EXPORT_ENABLE(ps->info.fs.writes_z) | - S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.fs.writes_stencil) | - S_02880C_KILL_ENABLE(!!ps->info.fs.can_discard) | - S_02880C_Z_ORDER(z_order) | - S_02880C_DEPTH_BEFORE_SHADER(ps->info.fs.early_fragment_test) | - S_02880C_EXEC_ON_HIER_FAIL(ps->info.fs.writes_memory) | - S_02880C_EXEC_ON_NOOP(ps->info.fs.writes_memory)); + pipeline->graphics.db_shader_control); radeon_set_context_reg(cmd_buffer->cs, R_0286CC_SPI_PS_INPUT_ENA, ps->config.spi_ps_input_ena); @@ -685,85 +909,56 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg(cmd_buffer->cs, R_0286D0_SPI_PS_INPUT_ADDR, ps->config.spi_ps_input_addr); - spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(0); + if (ps->info.info.ps.force_persample) + spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); + radeon_set_context_reg(cmd_buffer->cs, R_0286D8_SPI_PS_IN_CONTROL, S_0286D8_NUM_INTERP(ps->info.fs.num_interp)); radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl); radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT, - ps->info.fs.writes_stencil ? V_028710_SPI_SHADER_32_GR : - ps->info.fs.writes_z ? V_028710_SPI_SHADER_32_R : - V_028710_SPI_SHADER_ZERO); + pipeline->graphics.shader_z_format); radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format); radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask); radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask); - if (ps->info.fs.has_pcoord) { - unsigned val; - val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20); - radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, val); - ps_offset++; - } - - if (ps->info.fs.prim_id_input && (vs->info.vs.prim_id_output != 0xffffffff)) { - unsigned vs_offset, flat_shade; - unsigned val; - vs_offset = vs->info.vs.prim_id_output; - flat_shade = true; - val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade); - radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, val); - ++ps_offset; + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + /* optimise this? */ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } - if (ps->info.fs.layer_input && (vs->info.vs.layer_output != 0xffffffff)) { - unsigned vs_offset, flat_shade; - unsigned val; - vs_offset = vs->info.vs.layer_output; - flat_shade = true; - val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade); - radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, val); - ++ps_offset; - } - - for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) { - unsigned vs_offset, flat_shade; - unsigned val; - - if (!(ps->info.fs.input_mask & (1u << i))) - continue; - - - if (!(vs->info.vs.export_mask & (1u << i))) { - radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, - S_028644_OFFSET(0x20)); - ++ps_offset; - continue; + if (pipeline->graphics.ps_input_cntl_num) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0, pipeline->graphics.ps_input_cntl_num); + for (unsigned i = 0; i < pipeline->graphics.ps_input_cntl_num; i++) { + radeon_emit(cmd_buffer->cs, pipeline->graphics.ps_input_cntl[i]); } + } +} - vs_offset = util_bitcount(vs->info.vs.export_mask & ((1u << i) - 1)); - if (vs->info.vs.prim_id_output != 0xffffffff) { - if (vs_offset >= vs->info.vs.prim_id_output) - vs_offset++; - } - if (vs->info.vs.layer_output != 0xffffffff) { - if (vs_offset >= vs->info.vs.layer_output) - vs_offset++; - } - flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset)); +static void polaris_set_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + uint32_t vtx_reuse_depth = 30; + if (cmd_buffer->device->physical_device->rad_info.family < CHIP_POLARIS10) + return; - val = S_028644_OFFSET(vs_offset) | S_028644_FLAT_SHADE(flat_shade); - radeon_set_context_reg(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0 + 4 * ps_offset, val); - ++ps_offset; + if (pipeline->shaders[MESA_SHADER_TESS_EVAL]) { + if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) + vtx_reuse_depth = 14; } + radeon_set_context_reg(cmd_buffer->cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + vtx_reuse_depth); } static void -radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, - struct radv_pipeline *pipeline) +radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) return; @@ -772,11 +967,10 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, radv_emit_graphics_raster_state(cmd_buffer, pipeline); radv_update_multisample_state(cmd_buffer, pipeline); radv_emit_vertex_shader(cmd_buffer, pipeline); + radv_emit_tess_shaders(cmd_buffer, pipeline); radv_emit_geometry_shader(cmd_buffer, pipeline); radv_emit_fragment_shader(cmd_buffer, pipeline); - - radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, - pipeline->graphics.prim_restart_enable); + polaris_set_vgt_vertex_reuse(cmd_buffer, pipeline); cmd_buffer->scratch_size_needed = MAX2(cmd_buffer->scratch_size_needed, @@ -785,6 +979,23 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE, S_0286E8_WAVES(pipeline->max_waves) | S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); + + if (!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband != + pipeline->graphics.can_use_guardband) + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; + + radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, pipeline->graphics.vgt_shader_stages_en); + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, pipeline->graphics.prim); + } else { + radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, pipeline->graphics.prim); + } + radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, pipeline->graphics.gs_out); + + radv_save_pipeline(cmd_buffer, pipeline, RING_GFX); + cmd_buffer->state.emitted_pipeline = pipeline; } @@ -799,97 +1010,193 @@ static void radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) { uint32_t count = cmd_buffer->state.dynamic.scissor.count; + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; + si_emit_cache_flush(cmd_buffer); + } si_write_scissors(cmd_buffer->cs, 0, count, - cmd_buffer->state.dynamic.scissor.scissors); + cmd_buffer->state.dynamic.scissor.scissors, + cmd_buffer->state.dynamic.viewport.viewports, + cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband); radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, cmd_buffer->state.pipeline->graphics.ms.pa_sc_mode_cntl_0 | S_028A48_VPORT_SCISSOR_ENABLE(count ? 1 : 0)); } static void -radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, - int index, - struct radv_color_buffer_info *cb) +radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer) { - bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI; - radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); - radeon_emit(cmd_buffer->cs, cb->cb_color_base); - radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); - radeon_emit(cmd_buffer->cs, cb->cb_color_slice); - radeon_emit(cmd_buffer->cs, cb->cb_color_view); - radeon_emit(cmd_buffer->cs, cb->cb_color_info); - radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); - radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); - radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); - radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); - radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); - radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); + unsigned width = cmd_buffer->state.dynamic.line_width * 8; - if (is_vi) { /* DCC BASE */ - radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); - } + radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, + S_028A08_WIDTH(CLAMP(width, 0, 0xFFF))); } static void -radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, - struct radv_ds_buffer_info *ds, - struct radv_image *image, - VkImageLayout layout) +radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer) { - uint32_t db_z_info = ds->db_z_info; - - if (!radv_layout_has_htile(image, layout)) - db_z_info &= C_028040_TILE_SURFACE_ENABLE; - - if (!radv_layout_can_expclear(image, layout)) - db_z_info &= C_028040_ALLOW_EXPCLEAR & C_028044_ALLOW_EXPCLEAR; - - radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); - radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); - - radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ - radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ - radeon_emit(cmd_buffer->cs, ds->db_stencil_info); /* R_028044_DB_STENCIL_INFO */ - radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ - radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; - radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface); - radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, - ds->pa_su_poly_offset_db_fmt_cntl); + radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); + radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4); } -/* - * To hw resolve multisample images both src and dst need to have the same - * micro tiling mode. However we don't always know in advance when creating - * the images. This function gets called if we have a resolve attachment, - * and tests if the attachment image has the same tiling mode, then it - * checks if the generated framebuffer data has the same tiling mode, and - * updates it if not. - */ -static void radv_set_optimal_micro_tile_mode(struct radv_device *device, - struct radv_attachment_info *att, - uint32_t micro_tile_mode) +static void +radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer) { - struct radv_image *image = att->attachment->image; - uint32_t tile_mode_index; - if (image->surface.nsamples <= 1) - return; + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; - if (image->surface.micro_tile_mode != micro_tile_mode) { - radv_image_set_optimal_micro_tile_mode(device, image, micro_tile_mode); - } + radeon_set_context_reg_seq(cmd_buffer->cs, + R_028430_DB_STENCILREFMASK, 2); + radeon_emit(cmd_buffer->cs, + S_028430_STENCILTESTVAL(d->stencil_reference.front) | + S_028430_STENCILMASK(d->stencil_compare_mask.front) | + S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | + S_028430_STENCILOPVAL(1)); + radeon_emit(cmd_buffer->cs, + S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | + S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | + S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | + S_028434_STENCILOPVAL_BF(1)); +} + +static void +radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + + radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, + fui(d->depth_bounds.min)); + radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX, + fui(d->depth_bounds.max)); +} + +static void +radv_emit_depth_biais(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_raster_state *raster = &cmd_buffer->state.pipeline->graphics.raster; + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + unsigned slope = fui(d->depth_bias.slope * 16.0f); + unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale); + + if (G_028814_POLY_OFFSET_FRONT_ENABLE(raster->pa_su_sc_mode_cntl)) { + radeon_set_context_reg_seq(cmd_buffer->cs, + R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); + radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ + radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ + radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */ + radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ + radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */ + } +} + +static void +radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, + int index, + struct radv_color_buffer_info *cb) +{ + bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI; + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); + radeon_emit(cmd_buffer->cs, cb->cb_color_base); + radeon_emit(cmd_buffer->cs, cb->cb_color_base >> 32); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2); + radeon_emit(cmd_buffer->cs, cb->cb_color_view); + radeon_emit(cmd_buffer->cs, cb->cb_color_info); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask >> 32); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask >> 32); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_base); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_base >> 32); + + radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, + cb->gfx9_epitch); + } else { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); + radeon_emit(cmd_buffer->cs, cb->cb_color_base); + radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); + radeon_emit(cmd_buffer->cs, cb->cb_color_slice); + radeon_emit(cmd_buffer->cs, cb->cb_color_view); + radeon_emit(cmd_buffer->cs, cb->cb_color_info); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); + + if (is_vi) { /* DCC BASE */ + radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); + } + } +} + +static void +radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_ds_buffer_info *ds, + struct radv_image *image, + VkImageLayout layout) +{ + uint32_t db_z_info = ds->db_z_info; + uint32_t db_stencil_info = ds->db_stencil_info; + + if (!radv_layout_has_htile(image, layout, + radv_image_queue_family_mask(image, + cmd_buffer->queue_family_index, + cmd_buffer->queue_family_index))) { + db_z_info &= C_028040_TILE_SURFACE_ENABLE; + db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); + } + + radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); + radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface); - if (att->cb.micro_tile_mode != micro_tile_mode) { - tile_mode_index = image->surface.tiling_index[0]; - att->cb.cb_color_attrib &= C_028C74_TILE_MODE_INDEX; - att->cb.cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); - att->cb.micro_tile_mode = micro_tile_mode; + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(cmd_buffer->cs, ds->db_htile_data_base); + radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32); + radeon_emit(cmd_buffer->cs, ds->db_depth_size); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10); + radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2); + radeon_emit(cmd_buffer->cs, ds->db_z_info2); + radeon_emit(cmd_buffer->cs, ds->db_stencil_info2); + } else { + radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ + radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ + radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ + } + + radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + ds->pa_su_poly_offset_db_fmt_cntl); } void @@ -898,11 +1205,11 @@ radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; unsigned reg_offset = 0, reg_count = 0; - if (!image->htile.size || !aspects) + if (!image->surface.htile_size || !aspects) return; if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { @@ -938,10 +1245,10 @@ static void radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; - if (!image->htile.size) + if (!image->surface.htile_size) return; cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); @@ -959,13 +1266,42 @@ radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, 0); } +/* + *with DCC some colors don't require CMASK elimiation before being + * used as a texture. This sets a predicate value to determine if the + * cmask eliminate is required. + */ +void +radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + bool value) +{ + uint64_t pred_val = value; + uint64_t va = radv_buffer_get_va(image->bo); + va += image->offset + image->dcc_pred_offset; + + if (!image->surface.dcc_size) + return; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit(cmd_buffer->cs, pred_val); + radeon_emit(cmd_buffer->cs, pred_val >> 32); +} + void radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int idx, uint32_t color_values[2]) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; if (!image->cmask.size && !image->surface.dcc_size) @@ -992,7 +1328,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int idx) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; if (!image->cmask.size && !image->surface.dcc_size) @@ -1001,7 +1337,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c; cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_COUNT_SEL); @@ -1010,7 +1346,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, reg >> 2); radeon_emit(cmd_buffer->cs, 0); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, 0); } @@ -1020,21 +1356,21 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) int i; struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; - int dst_resolve_micro_tile_mode = -1; - if (subpass->has_resolve) { - uint32_t a = subpass->resolve_attachments[0].attachment; - const struct radv_image *image = framebuffer->attachments[a].attachment->image; - dst_resolve_micro_tile_mode = image->surface.micro_tile_mode; - } - for (i = 0; i < subpass->color_count; ++i) { + /* this may happen for inherited secondary recording */ + if (!framebuffer) + return; + + for (i = 0; i < 8; ++i) { + if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { + radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + continue; + } + int idx = subpass->color_attachments[i].attachment; struct radv_attachment_info *att = &framebuffer->attachments[idx]; - if (dst_resolve_micro_tile_mode != -1) { - radv_set_optimal_micro_tile_mode(cmd_buffer->device, - att, dst_resolve_micro_tile_mode); - } cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8); assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT); @@ -1043,16 +1379,18 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) radv_load_color_clear_regs(cmd_buffer, att->attachment->image, i); } - for (i = subpass->color_count; i < 8; i++) - radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, - S_028C70_FORMAT(V_028C70_COLOR_INVALID)); - if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { int idx = subpass->depth_stencil_attachment.attachment; VkImageLayout layout = subpass->depth_stencil_attachment.layout; struct radv_attachment_info *att = &framebuffer->attachments[idx]; struct radv_image *image = att->attachment->image; cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8); + MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image, + cmd_buffer->queue_family_index, + cmd_buffer->queue_family_index); + /* We currently don't support writing decompressed HTILE */ + assert(radv_layout_has_htile(image, layout, queue_mask) == + radv_layout_is_htile_compressed(image, layout, queue_mask)); radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout); @@ -1062,13 +1400,22 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) } radv_load_depth_clear_regs(cmd_buffer, image); } else { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); - radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* R_028040_DB_Z_INFO */ - radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* R_028044_DB_STENCIL_INFO */ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) + radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); + else + radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); + + radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ } radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height)); + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } } void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) @@ -1100,54 +1447,33 @@ void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) static void radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) { - struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + if (G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.raster.pa_cl_clip_cntl)) + return; - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) { - unsigned width = cmd_buffer->state.dynamic.line_width * 8; - radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(CLAMP(width, 0, 0xFFF))); - } + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) + radv_emit_viewport(cmd_buffer); - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cmd_buffer->cs, (uint32_t*)d->blend_constants, 4); - } + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) + radv_emit_scissor(cmd_buffer); + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) + radv_emit_line_width(cmd_buffer); + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) + radv_emit_blend_constants(cmd_buffer); if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | - RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) | - S_028430_STENCILMASK(d->stencil_compare_mask.front) | - S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | - S_028430_STENCILOPVAL(1)); - radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | - S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | - S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | - S_028434_STENCILOPVAL_BF(1)); - } + RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) + radv_emit_stencil(cmd_buffer); if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE | - RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)) { - radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, fui(d->depth_bounds.min)); - radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX, fui(d->depth_bounds.max)); - } + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)) + radv_emit_depth_bounds(cmd_buffer); if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE | - RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)) { - struct radv_raster_state *raster = &cmd_buffer->state.pipeline->graphics.raster; - unsigned slope = fui(d->depth_bias.slope * 16.0f); - unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale); - - if (G_028814_POLY_OFFSET_FRONT_ENABLE(raster->pa_su_sc_mode_cntl)) { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); - radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ - radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ - radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */ - radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ - radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */ - } - } + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)) + radv_emit_depth_biais(cmd_buffer); cmd_buffer->state.dirty = 0; } @@ -1160,9 +1486,9 @@ emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer, gl_shader_stage stage) { struct ac_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx]; - uint32_t base_reg = shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); - if (desc_set_loc->sgpr_idx == -1) + if (desc_set_loc->sgpr_idx == -1 || desc_set_loc->indirect) return; assert(!desc_set_loc->indirect); @@ -1175,51 +1501,125 @@ emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer, static void radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer, - struct radv_pipeline *pipeline, VkShaderStageFlags stages, struct radv_descriptor_set *set, unsigned idx) { - if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_FRAGMENT); + if (cmd_buffer->state.pipeline) { + radv_foreach_stage(stage, stages) { + if (cmd_buffer->state.pipeline->shaders[stage]) + emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline, + idx, set->va, + stage); + } + } - if (stages & VK_SHADER_STAGE_VERTEX_BIT) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, + if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT)) + emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.compute_pipeline, idx, set->va, - MESA_SHADER_VERTEX); + MESA_SHADER_COMPUTE); +} - if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(pipeline)) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_GEOMETRY); +static void +radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_descriptor_set *set = &cmd_buffer->push_descriptors.set; + unsigned bo_offset; - if (stages & VK_SHADER_STAGE_COMPUTE_BIT) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_COMPUTE); + if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32, + set->mapped_ptr, + &bo_offset)) + return; + + set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + set->va += bo_offset; +} + +static void +radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer) +{ + uint32_t size = MAX_SETS * 2 * 4; + uint32_t offset; + void *ptr; + + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, + 256, &offset, &ptr)) + return; + + for (unsigned i = 0; i < MAX_SETS; i++) { + uint32_t *uptr = ((uint32_t *)ptr) + i * 2; + uint64_t set_va = 0; + struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i]; + if (set) + set_va = set->va; + uptr[0] = set_va & 0xffffffff; + uptr[1] = set_va >> 32; + } + + uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + va += offset; + + if (cmd_buffer->state.pipeline) { + if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + } + + if (cmd_buffer->state.compute_pipeline) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); } static void radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, - struct radv_pipeline *pipeline, VkShaderStageFlags stages) { unsigned i; + if (!cmd_buffer->state.descriptors_dirty) return; - for (i = 0; i < MAX_SETS; i++) { - if (!(cmd_buffer->state.descriptors_dirty & (1 << i))) - continue; + if (cmd_buffer->state.push_descriptors_dirty) + radv_flush_push_descriptors(cmd_buffer); + + if ((cmd_buffer->state.pipeline && cmd_buffer->state.pipeline->need_indirect_descriptor_sets) || + (cmd_buffer->state.compute_pipeline && cmd_buffer->state.compute_pipeline->need_indirect_descriptor_sets)) { + radv_flush_indirect_descriptor_sets(cmd_buffer); + } + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, + MAX_SETS * MESA_SHADER_STAGES * 4); + + for_each_bit(i, cmd_buffer->state.descriptors_dirty) { struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i]; if (!set) continue; - radv_emit_descriptor_set_userdata(cmd_buffer, pipeline, stages, set, i); + radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i); } cmd_buffer->state.descriptors_dirty = 0; + cmd_buffer->state.push_descriptors_dirty = false; + + radv_save_descriptors(cmd_buffer); + + assert(cmd_buffer->cs->cdw <= cdw_max); } static void @@ -1236,129 +1636,147 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, if (!stages || !layout || (!layout->push_constant_size && !layout->dynamic_offset_count)) return; - radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size + - 16 * layout->dynamic_offset_count, - 256, &offset, &ptr); + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size + + 16 * layout->dynamic_offset_count, + 256, &offset, &ptr)) + return; memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size); memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers, 16 * layout->dynamic_offset_count); - va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); va += offset; - if (stages & VK_SHADER_STAGE_VERTEX_BIT) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, - AC_UD_PUSH_CONSTANTS, va); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, MESA_SHADER_STAGES * 4); - if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, - AC_UD_PUSH_CONSTANTS, va); + radv_foreach_stage(stage, stages) { + if (pipeline->shaders[stage]) { + radv_emit_userdata_address(cmd_buffer, pipeline, stage, + AC_UD_PUSH_CONSTANTS, va); + } + } - if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(pipeline)) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY, - AC_UD_PUSH_CONSTANTS, va); + cmd_buffer->push_constant_stages &= ~stages; + assert(cmd_buffer->cs->cdw <= cdw_max); +} - if (stages & VK_SHADER_STAGE_COMPUTE_BIT) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE, - AC_UD_PUSH_CONSTANTS, va); +static void radv_emit_primitive_reset_state(struct radv_cmd_buffer *cmd_buffer, + bool indexed_draw) +{ + int32_t primitive_reset_en = indexed_draw && cmd_buffer->state.pipeline->graphics.prim_restart_enable; - cmd_buffer->push_constant_stages &= ~stages; + if (primitive_reset_en != cmd_buffer->state.last_primitive_reset_en) { + cmd_buffer->state.last_primitive_reset_en = primitive_reset_en; + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, + primitive_reset_en); + } else { + radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, + primitive_reset_en); + } + } + + if (primitive_reset_en) { + uint32_t primitive_reset_index = cmd_buffer->state.index_type ? 0xffffffffu : 0xffffu; + + if (primitive_reset_index != cmd_buffer->state.last_primitive_reset_index) { + cmd_buffer->state.last_primitive_reset_index = primitive_reset_index; + radeon_set_context_reg(cmd_buffer->cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, + primitive_reset_index); + } + } } -static void -radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer) +static bool +radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer) { - struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; struct radv_device *device = cmd_buffer->device; - uint32_t ia_multi_vgt_param; - uint32_t ls_hs_config = 0; - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, 4096); - cmd_buffer->no_draws = false; - if ((cmd_buffer->state.vertex_descriptors_dirty || cmd_buffer->state.vb_dirty) && - cmd_buffer->state.pipeline->num_vertex_attribs) { + if ((cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline || cmd_buffer->state.vb_dirty) && + cmd_buffer->state.pipeline->vertex_elements.count && + cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.has_vertex_buffers) { + struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements; unsigned vb_offset; void *vb_ptr; uint32_t i = 0; - uint32_t num_attribs = cmd_buffer->state.pipeline->num_vertex_attribs; + uint32_t count = velems->count; uint64_t va; /* allocate some descriptor state for vertex buffers */ - radv_cmd_buffer_upload_alloc(cmd_buffer, num_attribs * 16, 256, - &vb_offset, &vb_ptr); + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256, + &vb_offset, &vb_ptr)) + return false; - for (i = 0; i < num_attribs; i++) { + for (i = 0; i < count; i++) { uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4]; uint32_t offset; - int vb = cmd_buffer->state.pipeline->va_binding[i]; + int vb = velems->binding[i]; struct radv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb]; device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8); - va = device->ws->buffer_get_va(buffer->bo); + va = radv_buffer_get_va(buffer->bo); - offset = cmd_buffer->state.vertex_bindings[vb].offset + cmd_buffer->state.pipeline->va_offset[i]; + offset = cmd_buffer->state.vertex_bindings[vb].offset + velems->offset[i]; va += offset + buffer->offset; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride) - desc[2] = (buffer->size - offset - cmd_buffer->state.pipeline->va_format_size[i]) / stride + 1; + desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1; else desc[2] = buffer->size - offset; - desc[3] = cmd_buffer->state.pipeline->va_rsrc_word3[i]; + desc[3] = velems->rsrc_word3[i]; } - va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); va += vb_offset; - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va); } + cmd_buffer->state.vb_dirty = false; - cmd_buffer->state.vertex_descriptors_dirty = false; - cmd_buffer->state.vb_dirty = 0; - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) - radv_emit_graphics_pipeline(cmd_buffer, pipeline); - - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RENDER_TARGETS) - radv_emit_framebuffer_state(cmd_buffer); + return true; +} - if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) - radv_emit_viewport(cmd_buffer); +static void +radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer, + bool indexed_draw, bool instanced_draw, + bool indirect_draw, + uint32_t draw_vertex_count) +{ + uint32_t ia_multi_vgt_param; - if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) - radv_emit_scissor(cmd_buffer); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, 4096); - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) { - uint32_t stages = 0; + if (!radv_cmd_buffer_update_vertex_descriptors(cmd_buffer)) + return; - if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | - S_028B54_GS_EN(1) | - S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) + radv_emit_graphics_pipeline(cmd_buffer); - radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, stages); - ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer); + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RENDER_TARGETS) + radv_emit_framebuffer_state(cmd_buffer); - if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { + ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, draw_vertex_count); + if (cmd_buffer->state.last_ia_multi_vgt_param != ia_multi_vgt_param) { + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); + else if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) radeon_set_context_reg_idx(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); - radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); - radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, cmd_buffer->state.pipeline->graphics.prim); - } else { - radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, cmd_buffer->state.pipeline->graphics.prim); + else radeon_set_context_reg(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); - radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); - } - radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, cmd_buffer->state.pipeline->graphics.gs_out); + cmd_buffer->state.last_ia_multi_vgt_param = ia_multi_vgt_param; } radv_cmd_buffer_flush_dynamic_state(cmd_buffer); - radv_flush_descriptors(cmd_buffer, cmd_buffer->state.pipeline, - VK_SHADER_STAGE_ALL_GRAPHICS); + radv_emit_primitive_reset_state(cmd_buffer, indexed_draw); + + radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline, VK_SHADER_STAGE_ALL_GRAPHICS); @@ -1397,11 +1815,86 @@ static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, } } +static enum radv_cmd_flush_bits +radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, + VkAccessFlags src_flags) +{ + enum radv_cmd_flush_bits flush_bits = 0; + uint32_t b; + for_each_bit(b, src_flags) { + switch ((VkAccessFlagBits)(1 << b)) { + case VK_ACCESS_SHADER_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; + break; + case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; + break; + case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; + break; + case VK_ACCESS_TRANSFER_WRITE_BIT: + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | + RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | + RADV_CMD_FLAG_INV_GLOBAL_L2; + break; + default: + break; + } + } + return flush_bits; +} + +static enum radv_cmd_flush_bits +radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, + VkAccessFlags dst_flags, + struct radv_image *image) +{ + enum radv_cmd_flush_bits flush_bits = 0; + uint32_t b; + for_each_bit(b, dst_flags) { + switch ((VkAccessFlagBits)(1 << b)) { + case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: + case VK_ACCESS_INDEX_READ_BIT: + case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: + break; + case VK_ACCESS_UNIFORM_READ_BIT: + flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1; + break; + case VK_ACCESS_SHADER_READ_BIT: + case VK_ACCESS_TRANSFER_READ_BIT: + case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT: + flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2; + break; + case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: + /* TODO: change to image && when the image gets passed + * through from the subpass. */ + if (!image || (image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; + break; + case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT: + if (!image || (image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) + flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; + break; + default: + break; + } + } + return flush_bits; +} + static void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier) { + cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask); radv_stage_flush(cmd_buffer, barrier->src_stage_mask); - - /* TODO: actual cache flushes */ + cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, + NULL); } static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer, @@ -1419,7 +1912,7 @@ static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buf radv_handle_image_transition(cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_layout, - att.layout, 0, 0, range, + att.layout, 0, 0, &range, cmd_buffer->state.attachments[idx].pending_clear_aspects); cmd_buffer->state.attachments[idx].current_layout = att.layout; @@ -1435,8 +1928,9 @@ radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, radv_subpass_barrier(cmd_buffer, &subpass->start_barrier); for (unsigned i = 0; i < subpass->color_count; ++i) { - radv_handle_subpass_image_transition(cmd_buffer, - subpass->color_attachments[i]); + if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) + radv_handle_subpass_image_transition(cmd_buffer, + subpass->color_attachments[i]); } for (unsigned i = 0; i < subpass->input_count; ++i) { @@ -1455,7 +1949,7 @@ radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RENDER_TARGETS; } -static void +static VkResult radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass, const VkRenderPassBeginInfo *info) @@ -1464,7 +1958,7 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, if (pass->attachment_count == 0) { state->attachments = NULL; - return; + return VK_SUCCESS; } state->attachments = vk_alloc(&cmd_buffer->pool->alloc, @@ -1472,8 +1966,8 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, sizeof(state->attachments[0]), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (state->attachments == NULL) { - /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ - abort(); + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return cmd_buffer->record_result; } for (uint32_t i = 0; i < pass->attachment_count; ++i) { @@ -1491,6 +1985,9 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; } if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { @@ -1499,6 +1996,7 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, } state->attachments[i].pending_clear_aspects = clear_aspects; + state->attachments[i].cleared_views = 0; if (clear_aspects && info) { assert(info->clearValueCount > i); state->attachments[i].clear_value = info->pClearValues[i]; @@ -1506,6 +2004,8 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, state->attachments[i].current_layout = att->initial_layout; } + + return VK_SUCCESS; } VkResult radv_AllocateCommandBuffers( @@ -1523,8 +2023,22 @@ VkResult radv_AllocateCommandBuffers( sizeof(*pCommandBuffers)*pAllocateInfo->commandBufferCount); for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { - result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, - &pCommandBuffers[i]); + + if (!list_empty(&pool->free_cmd_buffers)) { + struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link); + + list_del(&cmd_buffer->pool_link); + list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); + + result = radv_reset_cmd_buffer(cmd_buffer); + cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + cmd_buffer->level = pAllocateInfo->level; + + pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer); + } else { + result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, + &pCommandBuffers[i]); + } if (result != VK_SUCCESS) break; } @@ -1536,24 +2050,6 @@ VkResult radv_AllocateCommandBuffers( return result; } -static void -radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer) -{ - list_del(&cmd_buffer->pool_link); - - list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, - &cmd_buffer->upload.list, list) { - cmd_buffer->device->ws->buffer_destroy(up->upload_bo); - list_del(&up->list); - free(up); - } - - if (cmd_buffer->upload.upload_bo) - cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo); - cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs); - vk_free(&cmd_buffer->pool->alloc, cmd_buffer); -} - void radv_FreeCommandBuffers( VkDevice device, VkCommandPool commandPool, @@ -1563,37 +2059,15 @@ void radv_FreeCommandBuffers( for (uint32_t i = 0; i < commandBufferCount; i++) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]); - if (cmd_buffer) - radv_cmd_buffer_destroy(cmd_buffer); - } -} + if (cmd_buffer) { + if (cmd_buffer->pool) { + list_del(&cmd_buffer->pool_link); + list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers); + } else + radv_cmd_buffer_destroy(cmd_buffer); -static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) -{ - - cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); - - list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, - &cmd_buffer->upload.list, list) { - cmd_buffer->device->ws->buffer_destroy(up->upload_bo); - list_del(&up->list); - free(up); + } } - - cmd_buffer->scratch_size_needed = 0; - cmd_buffer->compute_scratch_size_needed = 0; - cmd_buffer->esgs_ring_size_needed = 0; - cmd_buffer->gsvs_ring_size_needed = 0; - - if (cmd_buffer->upload.upload_bo) - cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, - cmd_buffer->upload.upload_bo, 8); - cmd_buffer->upload.offset = 0; - - cmd_buffer->record_fail = false; - - cmd_buffer->ring_offsets_idx = -1; - cmd_buffer->no_draws = true; } VkResult radv_ResetCommandBuffer( @@ -1601,19 +2075,18 @@ VkResult radv_ResetCommandBuffer( VkCommandBufferResetFlags flags) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_reset_cmd_buffer(cmd_buffer); - return VK_SUCCESS; + return radv_reset_cmd_buffer(cmd_buffer); } static void emit_gfx_buffer_state(struct radv_cmd_buffer *cmd_buffer) { struct radv_device *device = cmd_buffer->device; if (device->gfx_init) { - uint64_t va = device->ws->buffer_get_va(device->gfx_init); + uint64_t va = radv_buffer_get_va(device->gfx_init); device->ws->cs_add_buffer(cmd_buffer->cs, device->gfx_init, 8); radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, (va >> 32) & 0xffff); + radeon_emit(cmd_buffer->cs, va >> 32); radeon_emit(cmd_buffer->cs, device->gfx_init_size_dw & 0xffff); } else si_init_config(cmd_buffer); @@ -1624,34 +2097,25 @@ VkResult radv_BeginCommandBuffer( const VkCommandBufferBeginInfo *pBeginInfo) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_reset_cmd_buffer(cmd_buffer); + VkResult result; + + result = radv_reset_cmd_buffer(cmd_buffer); + if (result != VK_SUCCESS) + return result; memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); + cmd_buffer->state.last_primitive_reset_en = -1; + cmd_buffer->usage_flags = pBeginInfo->flags; /* setup initial configuration into command buffer */ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { switch (cmd_buffer->queue_family_index) { case RADV_QUEUE_GENERAL: - /* Flush read caches at the beginning of CS not flushed by the kernel. */ - cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_ICACHE | - RADV_CMD_FLAG_PS_PARTIAL_FLUSH | - RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_SMEM_L1 | - RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | - RADV_CMD_FLAG_INV_GLOBAL_L2; emit_gfx_buffer_state(cmd_buffer); radv_set_db_count_control(cmd_buffer); - si_emit_cache_flush(cmd_buffer); break; case RADV_QUEUE_COMPUTE: - cmd_buffer->state.flush_bits = RADV_CMD_FLAG_INV_ICACHE | - RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_SMEM_L1 | - RADV_CMD_FLAG_INV_GLOBAL_L2; si_init_compute(cmd_buffer); - si_emit_cache_flush(cmd_buffer); break; case RADV_QUEUE_TRANSFER: default: @@ -1660,17 +2124,22 @@ VkResult radv_BeginCommandBuffer( } if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + assert(pBeginInfo->pInheritanceInfo); cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); struct radv_subpass *subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; - radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL); + result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL); + if (result != VK_SUCCESS) + return result; + radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false); } - return VK_SUCCESS; + radv_cmd_buffer_trace_emit(cmd_buffer); + return result; } void radv_CmdBindVertexBuffers( @@ -1686,12 +2155,13 @@ void radv_CmdBindVertexBuffers( /* We have to defer setting up vertex buffer since we need the buffer * stride from the pipeline. */ - assert(firstBinding + bindingCount < MAX_VBS); + assert(firstBinding + bindingCount <= MAX_VBS); for (uint32_t i = 0; i < bindingCount; i++) { vb[firstBinding + i].buffer = radv_buffer_from_handle(pBuffers[i]); vb[firstBinding + i].offset = pOffsets[i]; - cmd_buffer->state.vb_dirty |= 1 << (firstBinding + i); } + + cmd_buffer->state.vb_dirty = true; } void radv_CmdBindIndexBuffer( @@ -1701,12 +2171,16 @@ void radv_CmdBindIndexBuffer( VkIndexType indexType) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer); - cmd_buffer->state.index_buffer = radv_buffer_from_handle(buffer); - cmd_buffer->state.index_offset = offset; cmd_buffer->state.index_type = indexType; /* vk matches hw */ + cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo); + cmd_buffer->state.index_va += index_buffer->offset + offset; + + int index_size_shift = cmd_buffer->state.index_type ? 2 : 1; + cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; - cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->state.index_buffer->bo, 8); + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, index_buffer->bo, 8); } @@ -1717,10 +2191,12 @@ void radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys *ws = cmd_buffer->device->ws; cmd_buffer->state.descriptors[idx] = set; - cmd_buffer->state.descriptors_dirty |= (1 << idx); + cmd_buffer->state.descriptors_dirty |= (1u << idx); if (!set) return; + assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + for (unsigned j = 0; j < set->layout->buffer_count; ++j) if (set->descriptors[j]) ws->cs_add_buffer(cmd_buffer->cs, set->descriptors[j], 7); @@ -1743,16 +2219,13 @@ void radv_CmdBindDescriptorSets( RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); unsigned dyn_idx = 0; - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, MAX_SETS * 4 * 6); - for (unsigned i = 0; i < descriptorSetCount; ++i) { unsigned idx = i + firstSet; RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]); radv_bind_descriptor_set(cmd_buffer, set, idx); for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) { - unsigned idx = j + layout->set[i].dynamic_offset_start; + unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start; uint32_t *dst = cmd_buffer->dynamic_buffers + idx * 4; assert(dyn_idx < dynamicOffsetCount); @@ -1771,8 +2244,117 @@ void radv_CmdBindDescriptorSets( set->layout->dynamic_shader_stages; } } +} + +static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, + struct radv_descriptor_set *set, + struct radv_descriptor_set_layout *layout) +{ + set->size = layout->size; + set->layout = layout; + + if (cmd_buffer->push_descriptors.capacity < set->size) { + size_t new_size = MAX2(set->size, 1024); + new_size = MAX2(new_size, 2 * cmd_buffer->push_descriptors.capacity); + new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS); + + free(set->mapped_ptr); + set->mapped_ptr = malloc(new_size); + + if (!set->mapped_ptr) { + cmd_buffer->push_descriptors.capacity = 0; + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return false; + } + + cmd_buffer->push_descriptors.capacity = new_size; + } + + return true; +} + +void radv_meta_push_descriptor_set( + struct radv_cmd_buffer* cmd_buffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites) +{ + RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); + struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors; + unsigned bo_offset; + + assert(set == 0); + assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); + + push_set->size = layout->set[set].layout->size; + push_set->layout = layout->set[set].layout; + + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32, + &bo_offset, + (void**) &push_set->mapped_ptr)) + return; + + push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + push_set->va += bo_offset; + + radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer, + radv_descriptor_set_to_handle(push_set), + descriptorWriteCount, pDescriptorWrites, 0, NULL); + + cmd_buffer->state.descriptors[set] = push_set; + cmd_buffer->state.descriptors_dirty |= (1u << set); +} + +void radv_CmdPushDescriptorSetKHR( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); + struct radv_descriptor_set *push_set = &cmd_buffer->push_descriptors.set; + + assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); + + if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout)) + return; + + radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer, + radv_descriptor_set_to_handle(push_set), + descriptorWriteCount, pDescriptorWrites, 0, NULL); - assert(cmd_buffer->cs->cdw <= cdw_max); + cmd_buffer->state.descriptors[set] = push_set; + cmd_buffer->state.descriptors_dirty |= (1u << set); + cmd_buffer->state.push_descriptors_dirty = true; +} + +void radv_CmdPushDescriptorSetWithTemplateKHR( + VkCommandBuffer commandBuffer, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + VkPipelineLayout _layout, + uint32_t set, + const void* pData) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); + struct radv_descriptor_set *push_set = &cmd_buffer->push_descriptors.set; + + assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); + + if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout)) + return; + + radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set, + descriptorUpdateTemplate, pData); + + cmd_buffer->state.descriptors[set] = push_set; + cmd_buffer->state.descriptors_dirty |= (1u << set); + cmd_buffer->state.push_descriptors_dirty = true; } void radv_CmdPushConstants(VkCommandBuffer commandBuffer, @@ -1792,13 +2374,16 @@ VkResult radv_EndCommandBuffer( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) + if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) { + if (cmd_buffer->device->physical_device->rad_info.chip_class == SI) + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; si_emit_cache_flush(cmd_buffer); + } - if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) || - cmd_buffer->record_fail) + if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs)) return VK_ERROR_OUT_OF_DEVICE_MEMORY; - return VK_SUCCESS; + + return cmd_buffer->record_result; } static void @@ -1815,9 +2400,10 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.emitted_compute_pipeline = pipeline; compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; - va = ws->buffer_get_va(compute_shader->bo); + va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, compute_shader->code_size); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 16); @@ -1849,8 +2435,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2])); assert(cmd_buffer->cs->cdw <= cdw_max); + radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE); } +static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer) +{ + for (unsigned i = 0; i < MAX_SETS; i++) { + if (cmd_buffer->state.descriptors[i]) + cmd_buffer->state.descriptors_dirty |= (1u << i); + } +} void radv_CmdBindPipeline( VkCommandBuffer commandBuffer, @@ -1860,10 +2454,7 @@ void radv_CmdBindPipeline( RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); - for (unsigned i = 0; i < MAX_SETS; i++) { - if (cmd_buffer->state.descriptors[i]) - cmd_buffer->state.descriptors_dirty |= (1 << i); - } + radv_mark_descriptor_sets_dirty(cmd_buffer); switch (pipelineBindPoint) { case VK_PIPELINE_BIND_POINT_COMPUTE: @@ -1872,7 +2463,9 @@ void radv_CmdBindPipeline( break; case VK_PIPELINE_BIND_POINT_GRAPHICS: cmd_buffer->state.pipeline = pipeline; - cmd_buffer->state.vertex_descriptors_dirty = true; + if (!pipeline) + break; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE; cmd_buffer->push_constant_stages |= pipeline->active_stages; @@ -1887,13 +2480,16 @@ void radv_CmdBindPipeline( if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed) cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size; + if (radv_pipeline_has_tess(pipeline)) + cmd_buffer->tess_rings_needed = true; + if (radv_pipeline_has_gs(pipeline)) { struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, AC_UD_SCRATCH_RING_OFFSETS); if (cmd_buffer->ring_offsets_idx == -1) cmd_buffer->ring_offsets_idx = loc->sgpr_idx; else if (loc->sgpr_idx != -1) - assert(loc->sgpr_idx != cmd_buffer->ring_offsets_idx); + assert(loc->sgpr_idx == cmd_buffer->ring_offsets_idx); } break; default: @@ -1909,10 +2505,10 @@ void radv_CmdSetViewport( const VkViewport* pViewports) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - const uint32_t total_count = firstViewport + viewportCount; - if (cmd_buffer->state.dynamic.viewport.count < total_count) - cmd_buffer->state.dynamic.viewport.count = total_count; + + assert(firstViewport < MAX_VIEWPORTS); + assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); memcpy(cmd_buffer->state.dynamic.viewport.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports)); @@ -1927,10 +2523,10 @@ void radv_CmdSetScissor( const VkRect2D* pScissors) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - const uint32_t total_count = firstScissor + scissorCount; - if (cmd_buffer->state.dynamic.scissor.count < total_count) - cmd_buffer->state.dynamic.scissor.count = total_count; + + assert(firstScissor < MAX_SCISSORS); + assert(total_count >= 1 && total_count <= MAX_SCISSORS); memcpy(cmd_buffer->state.dynamic.scissor.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors)); @@ -2031,7 +2627,6 @@ void radv_CmdSetStencilReference( cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE; } - void radv_CmdExecuteCommands( VkCommandBuffer commandBuffer, uint32_t commandBufferCount, @@ -2039,6 +2634,9 @@ void radv_CmdExecuteCommands( { RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer); + /* Emit pending flushes on primary prior to executing secondary */ + si_emit_cache_flush(primary); + for (uint32_t i = 0; i < commandBufferCount; i++) { RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]); @@ -2051,6 +2649,10 @@ void radv_CmdExecuteCommands( primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed; if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed) primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed; + if (secondary->tess_rings_needed) + primary->tess_rings_needed = true; + if (secondary->sample_positions_needed) + primary->sample_positions_needed = true; if (secondary->ring_offsets_idx != -1) { if (primary->ring_offsets_idx == -1) @@ -2059,13 +2661,18 @@ void radv_CmdExecuteCommands( assert(secondary->ring_offsets_idx == primary->ring_offsets_idx); } primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs); + + primary->state.emitted_pipeline = secondary->state.emitted_pipeline; + primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline; + primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en; + primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index; } - /* if we execute secondary we need to re-emit out pipelines */ + /* if we execute secondary we need to mark some stuff to reset dirty */ if (commandBufferCount) { - primary->state.emitted_pipeline = NULL; primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE; primary->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL; + radv_mark_descriptor_sets_dirty(primary); } } @@ -2089,6 +2696,7 @@ VkResult radv_CreateCommandPool( pool->alloc = device->alloc; list_inithead(&pool->cmd_buffers); + list_inithead(&pool->free_cmd_buffers); pool->queue_family_index = pCreateInfo->queueFamilyIndex; @@ -2114,6 +2722,11 @@ void radv_DestroyCommandPool( radv_cmd_buffer_destroy(cmd_buffer); } + list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, + &pool->free_cmd_buffers, pool_link) { + radv_cmd_buffer_destroy(cmd_buffer); + } + vk_free2(&device->alloc, pAllocator, pool); } @@ -2123,10 +2736,13 @@ VkResult radv_ResetCommandPool( VkCommandPoolResetFlags flags) { RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); + VkResult result; list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) { - radv_reset_cmd_buffer(cmd_buffer); + result = radv_reset_cmd_buffer(cmd_buffer); + if (result != VK_SUCCESS) + return result; } return VK_SUCCESS; @@ -2137,6 +2753,15 @@ void radv_TrimCommandPoolKHR( VkCommandPool commandPool, VkCommandPoolTrimFlagsKHR flags) { + RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); + + if (!pool) + return; + + list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, + &pool->free_cmd_buffers, pool_link) { + radv_cmd_buffer_destroy(cmd_buffer); + } } void radv_CmdBeginRenderPass( @@ -2150,13 +2775,15 @@ void radv_CmdBeginRenderPass( MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2048); + MAYBE_UNUSED VkResult result; cmd_buffer->state.framebuffer = framebuffer; cmd_buffer->state.pass = pass; cmd_buffer->state.render_area = pRenderPassBegin->renderArea; - radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin); - si_emit_cache_flush(cmd_buffer); + result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin); + if (result != VK_SUCCESS) + return; radv_cmd_buffer_set_subpass(cmd_buffer, pass->subpasses, true); assert(cmd_buffer->cs->cdw <= cdw_max); @@ -2170,7 +2797,6 @@ void radv_CmdNextSubpass( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - si_emit_cache_flush(cmd_buffer); radv_cmd_buffer_resolve_subpass(cmd_buffer); radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, @@ -2180,6 +2806,38 @@ void radv_CmdNextSubpass( radv_cmd_buffer_clear_subpass(cmd_buffer); } +static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { + if (!pipeline->shaders[stage]) + continue; + struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX); + if (loc->sgpr_idx == -1) + continue; + uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); + + } + if (pipeline->gs_copy_shader) { + struct ac_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX]; + if (loc->sgpr_idx != -1) { + uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); + } + } +} + +static void +radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, + uint32_t vertex_count) +{ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating)); + radeon_emit(cmd_buffer->cs, vertex_count); + radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | + S_0287F0_USE_OPAQUE(0)); +} + void radv_CmdDraw( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -2188,42 +2846,50 @@ void radv_CmdDraw( uint32_t firstInstance) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_cmd_buffer_flush_state(cmd_buffer); - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10); + radv_cmd_buffer_flush_state(cmd_buffer, false, (instanceCount > 1), false, vertexCount); - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, - AC_UD_VS_BASE_VERTEX_START_INSTANCE); - if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline)); - radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, firstVertex); - radeon_emit(cmd_buffer->cs, firstInstance); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 20 * MAX_VIEWS); + + assert(cmd_buffer->state.pipeline->graphics.vtx_base_sgpr); + radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.pipeline->graphics.vtx_base_sgpr, + cmd_buffer->state.pipeline->graphics.vtx_emit_num); + radeon_emit(cmd_buffer->cs, firstVertex); + radeon_emit(cmd_buffer->cs, firstInstance); + if (cmd_buffer->state.pipeline->graphics.vtx_emit_num == 3) radeon_emit(cmd_buffer->cs, 0); - } - radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, instanceCount); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0)); - radeon_emit(cmd_buffer->cs, vertexCount); - radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | - S_0287F0_USE_OPAQUE(0)); + if (!cmd_buffer->state.subpass->view_mask) { + radv_cs_emit_draw_packet(cmd_buffer, vertexCount); + } else { + unsigned i; + for_each_bit(i, cmd_buffer->state.subpass->view_mask) { + radv_emit_view_index(cmd_buffer, i); + + radv_cs_emit_draw_packet(cmd_buffer, vertexCount); + } + } assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_cmd_buffer_after_draw(cmd_buffer); } -static void radv_emit_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer) -{ - uint32_t primitive_reset_index = cmd_buffer->state.last_primitive_reset_index ? 0xffffffffu : 0xffffu; - if (cmd_buffer->state.pipeline->graphics.prim_restart_enable && - primitive_reset_index != cmd_buffer->state.last_primitive_reset_index) { - cmd_buffer->state.last_primitive_reset_index = primitive_reset_index; - radeon_set_context_reg(cmd_buffer->cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, - primitive_reset_index); - } +static void +radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, + uint64_t index_va, + uint32_t index_count) +{ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false)); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count); + radeon_emit(cmd_buffer->cs, index_va); + radeon_emit(cmd_buffer->cs, (index_va >> 32UL) & 0xFF); + radeon_emit(cmd_buffer->cs, index_count); + radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA); } void radv_CmdDrawIndexed( @@ -2236,40 +2902,85 @@ void radv_CmdDrawIndexed( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); int index_size = cmd_buffer->state.index_type ? 4 : 2; - uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; uint64_t index_va; - radv_cmd_buffer_flush_state(cmd_buffer); - radv_emit_primitive_reset_index(cmd_buffer); + radv_cmd_buffer_flush_state(cmd_buffer, true, (instanceCount > 1), false, indexCount); - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 26 * MAX_VIEWS); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_03090C_VGT_INDEX_TYPE, + 2, cmd_buffer->state.index_type); + } else { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); + } - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, - AC_UD_VS_BASE_VERTEX_START_INSTANCE); - if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline)); - radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, vertexOffset); - radeon_emit(cmd_buffer->cs, firstInstance); + assert(cmd_buffer->state.pipeline->graphics.vtx_base_sgpr); + radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.pipeline->graphics.vtx_base_sgpr, + cmd_buffer->state.pipeline->graphics.vtx_emit_num); + radeon_emit(cmd_buffer->cs, vertexOffset); + radeon_emit(cmd_buffer->cs, firstInstance); + if (cmd_buffer->state.pipeline->graphics.vtx_emit_num == 3) radeon_emit(cmd_buffer->cs, 0); - } + radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); radeon_emit(cmd_buffer->cs, instanceCount); - index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); - index_va += firstIndex * index_size + cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset; - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false)); - radeon_emit(cmd_buffer->cs, index_max_size); - radeon_emit(cmd_buffer->cs, index_va); - radeon_emit(cmd_buffer->cs, (index_va >> 32UL) & 0xFF); - radeon_emit(cmd_buffer->cs, indexCount); - radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA); + index_va = cmd_buffer->state.index_va; + index_va += firstIndex * index_size; + if (!cmd_buffer->state.subpass->view_mask) { + radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, indexCount); + } else { + unsigned i; + for_each_bit(i, cmd_buffer->state.subpass->view_mask) { + radv_emit_view_index(cmd_buffer, i); + + radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, indexCount); + } + } assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_cmd_buffer_after_draw(cmd_buffer); +} + +static void +radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, + bool indexed, + uint32_t draw_count, + uint64_t count_va, + uint32_t stride) +{ + struct radeon_winsys_cs *cs = cmd_buffer->cs; + unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA + : V_0287F0_DI_SRC_SEL_AUTO_INDEX; + bool draw_id_enable = cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.needs_draw_id; + uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr; + assert(base_reg); + + if (draw_count == 1 && !count_va && !draw_id_enable) { + radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : + PKT3_DRAW_INDIRECT, 3, false)); + radeon_emit(cs, 0); + radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, di_src_sel); + } else { + radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : + PKT3_DRAW_INDIRECT_MULTI, + 8, false)); + radeon_emit(cs, 0); + radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) | + S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | + S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); + radeon_emit(cs, draw_count); /* count */ + radeon_emit(cs, count_va); /* count_addr */ + radeon_emit(cs, count_va >> 32); + radeon_emit(cs, stride); /* stride */ + radeon_emit(cs, di_src_sel); + } } static void @@ -2285,14 +2996,13 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer, RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); RADV_FROM_HANDLE(radv_buffer, count_buffer, _count_buffer); struct radeon_winsys_cs *cs = cmd_buffer->cs; - unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA - : V_0287F0_DI_SRC_SEL_AUTO_INDEX; - uint64_t indirect_va = cmd_buffer->device->ws->buffer_get_va(buffer->bo); + + uint64_t indirect_va = radv_buffer_get_va(buffer->bo); indirect_va += offset + buffer->offset; uint64_t count_va = 0; if (count_buffer) { - count_va = cmd_buffer->device->ws->buffer_get_va(count_buffer->bo); + count_va = radv_buffer_get_va(count_buffer->bo); count_va += count_offset + count_buffer->offset; } @@ -2301,30 +3011,22 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->device->ws->cs_add_buffer(cs, buffer->bo, 8); - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, - AC_UD_VS_BASE_VERTEX_START_INSTANCE); - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline)); - assert(loc->sgpr_idx != -1); radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); radeon_emit(cs, 1); radeon_emit(cs, indirect_va); radeon_emit(cs, indirect_va >> 32); - radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : - PKT3_DRAW_INDIRECT_MULTI, - 8, false)); - radeon_emit(cs, 0); - radeon_emit(cs, ((base_reg + loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, ((base_reg + (loc->sgpr_idx + 1) * 4) - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (((base_reg + (loc->sgpr_idx + 2) * 4) - SI_SH_REG_OFFSET) >> 2) | - S_2C3_DRAW_INDEX_ENABLE(1) | - S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); - radeon_emit(cs, draw_count); /* count */ - radeon_emit(cs, count_va); /* count_addr */ - radeon_emit(cs, count_va >> 32); - radeon_emit(cs, stride); /* stride */ - radeon_emit(cs, di_src_sel); - radv_cmd_buffer_trace_emit(cmd_buffer); + if (!cmd_buffer->state.subpass->view_mask) { + radv_cs_emit_indirect_draw_packet(cmd_buffer, indexed, draw_count, count_va, stride); + } else { + unsigned i; + for_each_bit(i, cmd_buffer->state.subpass->view_mask) { + radv_emit_view_index(cmd_buffer, i); + + radv_cs_emit_indirect_draw_packet(cmd_buffer, indexed, draw_count, count_va, stride); + } + } + radv_cmd_buffer_after_draw(cmd_buffer); } static void @@ -2337,10 +3039,10 @@ radv_cmd_draw_indirect_count(VkCommandBuffer command uint32_t stride) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_cmd_buffer_flush_state(cmd_buffer); + radv_cmd_buffer_flush_state(cmd_buffer, false, false, true, 0); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, 14); + cmd_buffer->cs, 24 * MAX_VIEWS); radv_emit_indirect_draw(cmd_buffer, buffer, offset, countBuffer, countBufferOffset, maxDrawCount, stride, false); @@ -2359,16 +3061,12 @@ radv_cmd_draw_indexed_indirect_count( uint32_t stride) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - int index_size = cmd_buffer->state.index_type ? 4 : 2; - uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; uint64_t index_va; - radv_cmd_buffer_flush_state(cmd_buffer); - radv_emit_primitive_reset_index(cmd_buffer); + radv_cmd_buffer_flush_state(cmd_buffer, true, false, true, 0); - index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); - index_va += cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset; + index_va = cmd_buffer->state.index_va; - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 21); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 31 * MAX_VIEWS); radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); @@ -2378,7 +3076,7 @@ radv_cmd_draw_indexed_indirect_count( radeon_emit(cmd_buffer->cs, index_va >> 32); radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); - radeon_emit(cmd_buffer->cs, index_max_size); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count); radv_emit_indirect_draw(cmd_buffer, buffer, offset, countBuffer, countBufferOffset, maxDrawCount, stride, true); @@ -2436,16 +3134,155 @@ void radv_CmdDrawIndexedIndirectCountAMD( maxDrawCount, stride); } +struct radv_dispatch_info { + /** + * Determine the layout of the grid (in block units) to be used. + */ + uint32_t blocks[3]; + + /** + * Whether it's an unaligned compute dispatch. + */ + bool unaligned; + + /** + * Indirect compute parameters resource. + */ + struct radv_buffer *indirect; + uint64_t indirect_offset; +}; + +static void +radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, + const struct radv_dispatch_info *info) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; + struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; + struct radeon_winsys *ws = cmd_buffer->device->ws; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + struct ac_userdata_info *loc; + uint8_t grid_used; + + grid_used = compute_shader->info.info.cs.grid_components_used; + + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, + AC_UD_CS_GRID_SIZE); + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25); + + if (info->indirect) { + uint64_t va = radv_buffer_get_va(info->indirect->bo); + + va += info->indirect->offset + info->indirect_offset; + + ws->cs_add_buffer(cs, info->indirect->bo, 8); + + if (loc->sgpr_idx != -1) { + for (unsigned i = 0; i < grid_used; ++i) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG)); + radeon_emit(cs, (va + 4 * i)); + radeon_emit(cs, (va + 4 * i) >> 32); + radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + + loc->sgpr_idx * 4) >> 2) + i); + radeon_emit(cs, 0); + } + } + + if (radv_cmd_buffer_uses_mec(cmd_buffer)) { + radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 1); + } else { + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, 1); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, 0); + radeon_emit(cs, 1); + } + } else { + unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] }; + unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1); + + if (info->unaligned) { + unsigned *cs_block_size = compute_shader->info.cs.block_size; + unsigned remainder[3]; + + /* If aligned, these should be an entire block size, + * not 0. + */ + remainder[0] = blocks[0] + cs_block_size[0] - + align_u32_npot(blocks[0], cs_block_size[0]); + remainder[1] = blocks[1] + cs_block_size[1] - + align_u32_npot(blocks[1], cs_block_size[1]); + remainder[2] = blocks[2] + cs_block_size[2] - + align_u32_npot(blocks[2], cs_block_size[2]); + + blocks[0] = round_up_u32(blocks[0], cs_block_size[0]); + blocks[1] = round_up_u32(blocks[1], cs_block_size[1]); + blocks[2] = round_up_u32(blocks[2], cs_block_size[2]); + + radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_emit(cs, + S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); + radeon_emit(cs, + S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); + radeon_emit(cs, + S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); + + dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); + } + + if (loc->sgpr_idx != -1) { + assert(!loc->indirect); + assert(loc->num_sgprs == grid_used); + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + loc->sgpr_idx * 4, grid_used); + radeon_emit(cs, blocks[0]); + if (grid_used > 1) + radeon_emit(cs, blocks[1]); + if (grid_used > 2) + radeon_emit(cs, blocks[2]); + } + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, blocks[0]); + radeon_emit(cs, blocks[1]); + radeon_emit(cs, blocks[2]); + radeon_emit(cs, dispatch_initiator); + } + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + static void -radv_flush_compute_state(struct radv_cmd_buffer *cmd_buffer) +radv_dispatch(struct radv_cmd_buffer *cmd_buffer, + const struct radv_dispatch_info *info) { - cmd_buffer->no_draws = false; radv_emit_compute_pipeline(cmd_buffer); - radv_flush_descriptors(cmd_buffer, cmd_buffer->state.compute_pipeline, - VK_SHADER_STAGE_COMPUTE_BIT); + + radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT); radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline, VK_SHADER_STAGE_COMPUTE_BIT); + si_emit_cache_flush(cmd_buffer); + + radv_emit_dispatch_packets(cmd_buffer, info); + + radv_cmd_buffer_after_draw(cmd_buffer); } void radv_CmdDispatch( @@ -2455,31 +3292,13 @@ void radv_CmdDispatch( uint32_t z) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_dispatch_info info = {}; - radv_flush_compute_state(cmd_buffer); + info.blocks[0] = x; + info.blocks[1] = y; + info.blocks[2] = z; - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10); - - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline, - MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); - if (loc->sgpr_idx != -1) { - assert(!loc->indirect); - assert(loc->num_sgprs == 3); - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, x); - radeon_emit(cmd_buffer->cs, y); - radeon_emit(cmd_buffer->cs, z); - } - - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, x); - radeon_emit(cmd_buffer->cs, y); - radeon_emit(cmd_buffer->cs, z); - radeon_emit(cmd_buffer->cs, 1); - - assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_dispatch(cmd_buffer, &info); } void radv_CmdDispatchIndirect( @@ -2489,49 +3308,12 @@ void radv_CmdDispatchIndirect( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); - uint64_t va = cmd_buffer->device->ws->buffer_get_va(buffer->bo); - va += buffer->offset + offset; - - cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8); - - radv_flush_compute_state(cmd_buffer); - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25); - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline, - MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); - if (loc->sgpr_idx != -1) { - for (unsigned i = 0; i < 3; ++i) { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | - COPY_DATA_DST_SEL(COPY_DATA_REG)); - radeon_emit(cmd_buffer->cs, (va + 4 * i)); - radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32); - radeon_emit(cmd_buffer->cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i); - radeon_emit(cmd_buffer->cs, 0); - } - } - - if (radv_cmd_buffer_uses_mec(cmd_buffer)) { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, va >> 32); - radeon_emit(cmd_buffer->cs, 1); - } else { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, 1); - radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, va >> 32); + struct radv_dispatch_info info = {}; - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, 0); - radeon_emit(cmd_buffer->cs, 1); - } + info.indirect = buffer; + info.indirect_offset = offset; - assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_dispatch(cmd_buffer, &info); } void radv_unaligned_dispatch( @@ -2540,52 +3322,14 @@ void radv_unaligned_dispatch( uint32_t y, uint32_t z) { - struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; - struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; - uint32_t blocks[3], remainder[3]; - - blocks[0] = round_up_u32(x, compute_shader->info.cs.block_size[0]); - blocks[1] = round_up_u32(y, compute_shader->info.cs.block_size[1]); - blocks[2] = round_up_u32(z, compute_shader->info.cs.block_size[2]); - - /* If aligned, these should be an entire block size, not 0 */ - remainder[0] = x + compute_shader->info.cs.block_size[0] - align_u32_npot(x, compute_shader->info.cs.block_size[0]); - remainder[1] = y + compute_shader->info.cs.block_size[1] - align_u32_npot(y, compute_shader->info.cs.block_size[1]); - remainder[2] = z + compute_shader->info.cs.block_size[2] - align_u32_npot(z, compute_shader->info.cs.block_size[2]); - - radv_flush_compute_state(cmd_buffer); + struct radv_dispatch_info info = {}; - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15); + info.blocks[0] = x; + info.blocks[1] = y; + info.blocks[2] = z; + info.unaligned = 1; - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]) | - S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]) | - S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]) | - S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); - - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline, - MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); - if (loc->sgpr_idx != -1) { - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, blocks[0]); - radeon_emit(cmd_buffer->cs, blocks[1]); - radeon_emit(cmd_buffer->cs, blocks[2]); - } - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, blocks[0]); - radeon_emit(cmd_buffer->cs, blocks[1]); - radeon_emit(cmd_buffer->cs, blocks[2]); - radeon_emit(cmd_buffer->cs, S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_PARTIAL_TG_EN(1)); - - assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_dispatch(cmd_buffer, &info); } void radv_CmdEndRenderPass( @@ -2595,7 +3339,6 @@ void radv_CmdEndRenderPass( radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); - si_emit_cache_flush(cmd_buffer); radv_cmd_buffer_resolve_subpass(cmd_buffer); for (unsigned i = 0; i < cmd_buffer->state.framebuffer->attachment_count; ++i) { @@ -2612,54 +3355,72 @@ void radv_CmdEndRenderPass( cmd_buffer->state.framebuffer = NULL; } - +/* + * For HTILE we have the following interesting clear words: + * 0x0000030f: Uncompressed. + * 0xfffffff0: Clear depth to 1.0 + * 0x00000000: Clear depth to 0.0 + */ static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image) + struct radv_image *image, + const VkImageSubresourceRange *range, + uint32_t clear_word) { + assert(range->baseMipLevel == 0); + assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS); + unsigned layer_count = radv_get_layerCount(image, range); + uint64_t size = image->surface.htile_slice_size * layer_count; + uint64_t offset = image->offset + image->htile_offset + + image->surface.htile_slice_size * range->baseArrayLayer; cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; - radv_fill_buffer(cmd_buffer, image->bo, image->offset + image->htile.offset, - image->htile.size, 0xffffffff); + radv_fill_buffer(cmd_buffer, image->bo, offset, size, clear_word); cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_GLOBAL_L2; + RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; } static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout, VkImageLayout dst_layout, - VkImageSubresourceRange range, + unsigned src_queue_mask, + unsigned dst_queue_mask, + const VkImageSubresourceRange *range, VkImageAspectFlags pending_clears) { if (dst_layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL && (pending_clears & vk_format_aspects(image->vk_format)) == vk_format_aspects(image->vk_format) && cmd_buffer->state.render_area.offset.x == 0 && cmd_buffer->state.render_area.offset.y == 0 && - cmd_buffer->state.render_area.extent.width == image->extent.width && - cmd_buffer->state.render_area.extent.height == image->extent.height) { + cmd_buffer->state.render_area.extent.width == image->info.width && + cmd_buffer->state.render_area.extent.height == image->info.height) { /* The clear will initialize htile. */ return; } else if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED && - radv_layout_has_htile(image, dst_layout)) { + radv_layout_has_htile(image, dst_layout, dst_queue_mask)) { /* TODO: merge with the clear if applicable */ - radv_initialize_htile(cmd_buffer, image); - } else if (!radv_layout_has_htile(image, src_layout) && - radv_layout_has_htile(image, dst_layout)) { - radv_initialize_htile(cmd_buffer, image); - } else if ((radv_layout_has_htile(image, src_layout) && - !radv_layout_has_htile(image, dst_layout)) || - (radv_layout_is_htile_compressed(image, src_layout) && - !radv_layout_is_htile_compressed(image, dst_layout))) { + radv_initialize_htile(cmd_buffer, image, range, 0); + } else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) && + radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) { + radv_initialize_htile(cmd_buffer, image, range, 0xffffffff); + } else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) && + !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) { + VkImageSubresourceRange local_range = *range; + local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + local_range.baseMipLevel = 0; + local_range.levelCount = 1; + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; - range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - range.baseMipLevel = 0; - range.levelCount = 1; + radv_decompress_depth_image_inplace(cmd_buffer, image, &local_range); - radv_decompress_depth_image_inplace(cmd_buffer, image, &range); + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; } } @@ -2675,7 +3436,7 @@ void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_GLOBAL_L2; + RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; } static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffer, @@ -2684,7 +3445,7 @@ static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffe VkImageLayout dst_layout, unsigned src_queue_mask, unsigned dst_queue_mask, - VkImageSubresourceRange range, + const VkImageSubresourceRange *range, VkImageAspectFlags pending_clears) { if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { @@ -2694,7 +3455,7 @@ static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffe radv_initialise_cmask(cmd_buffer, image, 0xffffffffu); } else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) && !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) { - radv_fast_clear_flush_image_inplace(cmd_buffer, image); + radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); } } @@ -2712,7 +3473,7 @@ void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer, RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VMEM_L1 | - RADV_CMD_FLAG_INV_GLOBAL_L2; + RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; } static void radv_handle_dcc_image_transition(struct radv_cmd_buffer *cmd_buffer, @@ -2721,14 +3482,14 @@ static void radv_handle_dcc_image_transition(struct radv_cmd_buffer *cmd_buffer, VkImageLayout dst_layout, unsigned src_queue_mask, unsigned dst_queue_mask, - VkImageSubresourceRange range, + const VkImageSubresourceRange *range, VkImageAspectFlags pending_clears) { if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { radv_initialize_dcc(cmd_buffer, image, 0x20202020u); } else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) && !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) { - radv_fast_clear_flush_image_inplace(cmd_buffer, image); + radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); } } @@ -2738,7 +3499,7 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, VkImageLayout dst_layout, uint32_t src_family, uint32_t dst_family, - VkImageSubresourceRange range, + const VkImageSubresourceRange *range, VkImageAspectFlags pending_clears) { if (image->exclusive && src_family != dst_family) { @@ -2761,9 +3522,11 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, unsigned src_queue_mask = radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index); unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index); - if (image->htile.size) + if (image->surface.htile_size) radv_handle_depth_image_transition(cmd_buffer, image, src_layout, - dst_layout, range, pending_clears); + dst_layout, src_queue_mask, + dst_queue_mask, range, + pending_clears); if (image->cmask.size) radv_handle_cmask_image_transition(cmd_buffer, image, src_layout, @@ -2791,44 +3554,30 @@ void radv_CmdPipelineBarrier( const VkImageMemoryBarrier* pImageMemoryBarriers) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - VkAccessFlags src_flags = 0; - VkAccessFlags dst_flags = 0; - uint32_t b; + enum radv_cmd_flush_bits src_flush_bits = 0; + enum radv_cmd_flush_bits dst_flush_bits = 0; + for (uint32_t i = 0; i < memoryBarrierCount; i++) { - src_flags |= pMemoryBarriers[i].srcAccessMask; - dst_flags |= pMemoryBarriers[i].dstAccessMask; + src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask); + dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, + NULL); } for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) { - src_flags |= pBufferMemoryBarriers[i].srcAccessMask; - dst_flags |= pBufferMemoryBarriers[i].dstAccessMask; + src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask); + dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, + NULL); } for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { - src_flags |= pImageMemoryBarriers[i].srcAccessMask; - dst_flags |= pImageMemoryBarriers[i].dstAccessMask; + RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); + src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask); + dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, + image); } - enum radv_cmd_flush_bits flush_bits = 0; - for_each_bit(b, src_flags) { - switch ((VkAccessFlagBits)(1 << b)) { - case VK_ACCESS_SHADER_WRITE_BIT: - flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2; - break; - case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: - flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; - break; - case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: - flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; - break; - case VK_ACCESS_TRANSFER_WRITE_BIT: - flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; - break; - default: - break; - } - } - cmd_buffer->state.flush_bits |= flush_bits; + radv_stage_flush(cmd_buffer, srcStageMask); + cmd_buffer->state.flush_bits |= src_flush_bits; for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image); @@ -2837,39 +3586,11 @@ void radv_CmdPipelineBarrier( pImageMemoryBarriers[i].newLayout, pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex, - pImageMemoryBarriers[i].subresourceRange, + &pImageMemoryBarriers[i].subresourceRange, 0); } - flush_bits = 0; - - for_each_bit(b, dst_flags) { - switch ((VkAccessFlagBits)(1 << b)) { - case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: - case VK_ACCESS_INDEX_READ_BIT: - case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT: - flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1; - break; - case VK_ACCESS_UNIFORM_READ_BIT: - flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1; - break; - case VK_ACCESS_SHADER_READ_BIT: - flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2; - break; - case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: - case VK_ACCESS_TRANSFER_READ_BIT: - case VK_ACCESS_TRANSFER_WRITE_BIT: - case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT: - flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER | RADV_CMD_FLAG_INV_GLOBAL_L2; - default: - break; - } - } - - flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | - RADV_CMD_FLAG_PS_PARTIAL_FLUSH; - - cmd_buffer->state.flush_bits |= flush_bits; + cmd_buffer->state.flush_bits |= dst_flush_bits; } @@ -2879,32 +3600,21 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer, unsigned value) { struct radeon_winsys_cs *cs = cmd_buffer->cs; - uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo); + uint64_t va = radv_buffer_get_va(event->bo); cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8); - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18); /* TODO: this is overkill. Probably should figure something out from * the stage mask. */ - if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | - EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); - radeon_emit(cs, 2); - radeon_emit(cs, 0); - } - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | - EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); - radeon_emit(cs, value); - radeon_emit(cs, 0); + si_cs_emit_write_event_eop(cs, + cmd_buffer->state.predicating, + cmd_buffer->device->physical_device->rad_info.chip_class, + false, + EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, + 1, va, 2, value); assert(cmd_buffer->cs->cdw <= cdw_max); } @@ -2946,20 +3656,13 @@ void radv_CmdWaitEvents(VkCommandBuffer commandBuffer, for (unsigned i = 0; i < eventCount; ++i) { RADV_FROM_HANDLE(radv_event, event, pEvents[i]); - uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo); + uint64_t va = radv_buffer_get_va(event->bo); cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7); - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 1); /* reference value */ - radeon_emit(cs, 0xffffffff); /* mask */ - radeon_emit(cs, 4); /* poll interval */ - + si_emit_wait_fence(cs, false, va, 1, 0xffffffff); assert(cmd_buffer->cs->cdw <= cdw_max); } @@ -2972,7 +3675,7 @@ void radv_CmdWaitEvents(VkCommandBuffer commandBuffer, pImageMemoryBarriers[i].newLayout, pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex, - pImageMemoryBarriers[i].subresourceRange, + &pImageMemoryBarriers[i].subresourceRange, 0); }