X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_cmd_buffer.c;h=4db9d7628c29c0d33992ad1a2d802e7171d77e03;hb=d0d6a611d990c16136c2f27aeec192f37729fa0b;hp=f03e3dff3490c292d0064db718b70dbdd6e98f52;hpb=4f7fb25d4e92fb2b2a052f53225079641a896635;p=mesa.git diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f03e3dff349..4db9d7628c2 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -27,9 +27,12 @@ #include "radv_private.h" #include "radv_radeon_winsys.h" +#include "radv_shader.h" #include "radv_cs.h" #include "sid.h" +#include "gfx9d.h" #include "vk_format.h" +#include "radv_debug.h" #include "radv_meta.h" #include "ac_debug.h" @@ -80,14 +83,18 @@ radv_dynamic_state_copy(struct radv_dynamic_state *dest, const struct radv_dynamic_state *src, uint32_t copy_mask) { + /* Make sure to copy the number of viewports/scissors because they can + * only be specified at pipeline creation time. + */ + dest->viewport.count = src->viewport.count; + dest->scissor.count = src->scissor.count; + if (copy_mask & (1 << VK_DYNAMIC_STATE_VIEWPORT)) { - dest->viewport.count = src->viewport.count; typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count); } if (copy_mask & (1 << VK_DYNAMIC_STATE_SCISSOR)) { - dest->scissor.count = src->scissor.count; typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count); } @@ -140,7 +147,6 @@ static VkResult radv_create_cmd_buffer( VkCommandBuffer* pCommandBuffer) { struct radv_cmd_buffer *cmd_buffer; - VkResult result; unsigned ring; cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -169,8 +175,8 @@ static VkResult radv_create_cmd_buffer( cmd_buffer->cs = device->ws->cs_create(device->ws, ring); if (!cmd_buffer->cs) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; + vk_free(&cmd_buffer->pool->alloc, cmd_buffer); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer); @@ -180,11 +186,6 @@ static VkResult radv_create_cmd_buffer( list_inithead(&cmd_buffer->upload.list); return VK_SUCCESS; - -fail: - vk_free(&cmd_buffer->pool->alloc, cmd_buffer); - - return result; } static void @@ -206,7 +207,8 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer) vk_free(&cmd_buffer->pool->alloc, cmd_buffer); } -static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) +static VkResult +radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) { cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); @@ -218,6 +220,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) free(up); } + cmd_buffer->push_constant_stages = 0; cmd_buffer->scratch_size_needed = 0; cmd_buffer->compute_scratch_size_needed = 0; cmd_buffer->esgs_ring_size_needed = 0; @@ -230,9 +233,19 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->upload.upload_bo, 8); cmd_buffer->upload.offset = 0; - cmd_buffer->record_fail = false; + cmd_buffer->record_result = VK_SUCCESS; cmd_buffer->ring_offsets_idx = -1; + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + void *fence_ptr; + radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0, + &cmd_buffer->gfx9_fence_offset, + &fence_ptr); + cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo; + } + + return cmd_buffer->record_result; } static bool @@ -253,7 +266,7 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, RADEON_FLAG_CPU_ACCESS); if (!bo) { - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; return false; } @@ -262,7 +275,7 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, upload = malloc(sizeof(*upload)); if (!upload) { - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; device->ws->buffer_destroy(bo); return false; } @@ -277,7 +290,7 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo); if (!cmd_buffer->upload.map) { - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; return false; } @@ -322,6 +335,19 @@ radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, return true; } +static void +radv_emit_write_data_packet(struct radeon_winsys_cs *cs, uint64_t va, + unsigned count, const uint32_t *data) +{ + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_ME)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit_array(cs, data, count); +} + void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer) { struct radv_device *device = cmd_buffer->device; @@ -331,23 +357,103 @@ void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer) if (!device->trace_bo) return; - va = device->ws->buffer_get_va(device->trace_bo); + va = radv_buffer_get_va(device->trace_bo); + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) + va += 4; MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 7); ++cmd_buffer->state.trace_id; device->ws->cs_add_buffer(cs, device->trace_bo, 8); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(V_370_ME)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, cmd_buffer->state.trace_id); + radv_emit_write_data_packet(cs, va, 1, &cmd_buffer->state.trace_id); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id)); } +static void +radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer) +{ + if (cmd_buffer->device->debug_flags & RADV_DEBUG_SYNC_SHADERS) { + enum radv_cmd_flush_bits flags; + + /* Force wait for graphics/compute engines to be idle. */ + flags = RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH; + + si_cs_emit_cache_flush(cmd_buffer->cs, false, + cmd_buffer->device->physical_device->rad_info.chip_class, + NULL, 0, + radv_cmd_buffer_uses_mec(cmd_buffer), + flags); + } + + radv_cmd_buffer_trace_emit(cmd_buffer); +} + +static void +radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline, enum ring_type ring) +{ + struct radv_device *device = cmd_buffer->device; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint32_t data[2]; + uint64_t va; + + if (!device->trace_bo) + return; + + va = radv_buffer_get_va(device->trace_bo); + + switch (ring) { + case RING_GFX: + va += 8; + break; + case RING_COMPUTE: + va += 16; + break; + default: + assert(!"invalid ring type"); + } + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws, + cmd_buffer->cs, 6); + + data[0] = (uintptr_t)pipeline; + data[1] = (uintptr_t)pipeline >> 32; + + device->ws->cs_add_buffer(cs, device->trace_bo, 8); + radv_emit_write_data_packet(cs, va, 2, data); +} + +static void +radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_device *device = cmd_buffer->device; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + uint32_t data[MAX_SETS * 2] = {}; + uint64_t va; + + if (!device->trace_bo) + return; + + va = radv_buffer_get_va(device->trace_bo) + 24; + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws, + cmd_buffer->cs, 4 + MAX_SETS * 2); + + for (int i = 0; i < MAX_SETS; i++) { + struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i]; + if (!set) + continue; + + data[i * 2] = (uintptr_t)set; + data[i * 2 + 1] = (uintptr_t)set >> 32; + } + + device->ws->cs_add_buffer(cs, device->trace_bo, 8); + radv_emit_write_data_packet(cs, va, MAX_SETS * 2, data); +} + static void radv_emit_graphics_blend_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) @@ -357,6 +463,17 @@ radv_emit_graphics_blend_state(struct radv_cmd_buffer *cmd_buffer, 8); radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, pipeline->graphics.blend.cb_color_control); radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, pipeline->graphics.blend.db_alpha_to_mask); + + if (cmd_buffer->device->physical_device->has_rbplus) { + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, 8); + radeon_emit_array(cmd_buffer->cs, pipeline->graphics.blend.sx_mrt_blend_opt, 8); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3); + radeon_emit(cmd_buffer->cs, 0); /* R_028754_SX_PS_DOWNCONVERT */ + radeon_emit(cmd_buffer->cs, 0); /* R_028758_SX_BLEND_OPT_EPSILON */ + radeon_emit(cmd_buffer->cs, 0); /* R_02875C_SX_BLEND_OPT_CONTROL */ + } } static void @@ -378,34 +495,7 @@ static unsigned radv_pack_float_12p4(float x) x >= 4096 ? 0xffff : x * 16; } -static uint32_t -shader_stage_to_user_data_0(gl_shader_stage stage, bool has_gs, bool has_tess) -{ - switch (stage) { - case MESA_SHADER_FRAGMENT: - return R_00B030_SPI_SHADER_USER_DATA_PS_0; - case MESA_SHADER_VERTEX: - if (has_tess) - return R_00B530_SPI_SHADER_USER_DATA_LS_0; - else - return has_gs ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B130_SPI_SHADER_USER_DATA_VS_0; - case MESA_SHADER_GEOMETRY: - return R_00B230_SPI_SHADER_USER_DATA_GS_0; - case MESA_SHADER_COMPUTE: - return R_00B900_COMPUTE_USER_DATA_0; - case MESA_SHADER_TESS_CTRL: - return R_00B430_SPI_SHADER_USER_DATA_HS_0; - case MESA_SHADER_TESS_EVAL: - if (has_gs) - return R_00B330_SPI_SHADER_USER_DATA_ES_0; - else - return R_00B130_SPI_SHADER_USER_DATA_VS_0; - default: - unreachable("unknown shader"); - } -} - -static struct ac_userdata_info * +struct ac_userdata_info * radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx) @@ -420,7 +510,7 @@ radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, int idx, uint64_t va) { struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx); - uint32_t base_reg = shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); if (loc->sgpr_idx == -1) return; assert(loc->num_sgprs == 2); @@ -454,10 +544,15 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples); - if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) { + /* GFX9: Flush DFSM when the AA mode changes. */ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) { uint32_t offset; struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET); - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); if (loc->sgpr_idx == -1) return; assert(loc->num_sgprs == 1); @@ -510,6 +605,14 @@ radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer, raster->pa_su_sc_mode_cntl); } +static inline void +radv_emit_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va, + unsigned size) +{ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) + si_cp_dma_prefetch(cmd_buffer, va, size); +} + static void radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline, @@ -517,10 +620,11 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, struct ac_vs_output_info *outinfo) { struct radeon_winsys *ws = cmd_buffer->device->ws; - uint64_t va = ws->buffer_get_va(shader->bo); + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; unsigned export_count; ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); export_count = MAX2(1, outinfo->param_exports); radeon_set_context_reg(cmd_buffer->cs, R_0286C4_SPI_VS_OUT_CONFIG, @@ -555,8 +659,9 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_CL_VS_OUT_CNTL, pipeline->graphics.pa_cl_vs_out_cntl); - radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF, - S_028AB4_REUSE_OFF(outinfo->writes_viewport_index)); + if (cmd_buffer->device->physical_device->rad_info.chip_class <= VI) + radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF, + S_028AB4_REUSE_OFF(outinfo->writes_viewport_index)); } static void @@ -565,9 +670,10 @@ radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, struct ac_es_output_info *outinfo) { struct radeon_winsys *ws = cmd_buffer->device->ws; - uint64_t va = ws->buffer_get_va(shader->bo); + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, outinfo->esgs_itemsize / 4); @@ -583,10 +689,11 @@ radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader) { struct radeon_winsys *ws = cmd_buffer->device->ws; - uint64_t va = ws->buffer_get_va(shader->bo); + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; uint32_t rsrc2 = shader->rsrc2; ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2); radeon_emit(cmd_buffer->cs, va >> 8); @@ -607,9 +714,10 @@ radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader) { struct radeon_winsys *ws = cmd_buffer->device->ws; - uint64_t va = ws->buffer_get_va(shader->bo); + uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, shader->code_size); radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4); radeon_emit(cmd_buffer->cs, va >> 8); @@ -635,7 +743,7 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer, else radv_emit_hw_vs(cmd_buffer, pipeline, vs, &vs->info.vs.outinfo); - radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, 0); + radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, pipeline->graphics.vgt_primitiveid_en); } @@ -672,7 +780,7 @@ radv_emit_tess_shaders(struct radv_cmd_buffer *cmd_buffer, loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_CTRL, AC_UD_TCS_OFFCHIP_LAYOUT); if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_TESS_CTRL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_TESS_CTRL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); assert(loc->num_sgprs == 4); assert(!loc->indirect); radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 4); @@ -685,7 +793,7 @@ radv_emit_tess_shaders(struct radv_cmd_buffer *cmd_buffer, loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_EVAL, AC_UD_TES_OFFCHIP_LAYOUT); if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_TESS_EVAL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_TESS_EVAL, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); assert(loc->num_sgprs == 1); assert(!loc->indirect); @@ -695,7 +803,7 @@ radv_emit_tess_shaders(struct radv_cmd_buffer *cmd_buffer, loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_LS_TCS_IN_LAYOUT); if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); assert(loc->num_sgprs == 1); assert(!loc->indirect); @@ -741,8 +849,10 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer, S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0)); - va = ws->buffer_get_va(gs->bo); + va = radv_buffer_get_va(gs->bo) + gs->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8); + radv_emit_prefetch(cmd_buffer, va, gs->code_size); + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4); radeon_emit(cmd_buffer->cs, va >> 8); radeon_emit(cmd_buffer->cs, va >> 40); @@ -780,9 +890,9 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, assert (pipeline->shaders[MESA_SHADER_FRAGMENT]); ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; - - va = ws->buffer_get_va(ps->bo); + va = radv_buffer_get_va(ps->bo) + ps->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8); + radv_emit_prefetch(cmd_buffer, va, ps->code_size); radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4); radeon_emit(cmd_buffer->cs, va >> 8); @@ -799,7 +909,7 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg(cmd_buffer->cs, R_0286D0_SPI_PS_INPUT_ADDR, ps->config.spi_ps_input_addr); - if (ps->info.fs.force_persample) + if (ps->info.info.ps.force_persample) spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); radeon_set_context_reg(cmd_buffer->cs, R_0286D8_SPI_PS_IN_CONTROL, @@ -815,6 +925,12 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask); radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask); + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + /* optimise this? */ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + if (pipeline->graphics.ps_input_cntl_num) { radeon_set_context_reg_seq(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0, pipeline->graphics.ps_input_cntl_num); for (unsigned i = 0; i < pipeline->graphics.ps_input_cntl_num; i++) { @@ -839,9 +955,10 @@ static void polaris_set_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer, } static void -radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, - struct radv_pipeline *pipeline) +radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) return; @@ -867,6 +984,18 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband != pipeline->graphics.can_use_guardband) cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; + + radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, pipeline->graphics.vgt_shader_stages_en); + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, pipeline->graphics.prim); + } else { + radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, pipeline->graphics.prim); + } + radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, pipeline->graphics.gs_out); + + radv_save_pipeline(cmd_buffer, pipeline, RING_GFX); + cmd_buffer->state.emitted_pipeline = pipeline; } @@ -881,6 +1010,11 @@ static void radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) { uint32_t count = cmd_buffer->state.dynamic.scissor.count; + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; + si_emit_cache_flush(cmd_buffer); + } si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors, cmd_buffer->state.dynamic.viewport.viewports, @@ -889,27 +1023,117 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.pipeline->graphics.ms.pa_sc_mode_cntl_0 | S_028A48_VPORT_SCISSOR_ENABLE(count ? 1 : 0)); } +static void +radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer) +{ + unsigned width = cmd_buffer->state.dynamic.line_width * 8; + + radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, + S_028A08_WIDTH(CLAMP(width, 0, 0xFFF))); +} + +static void +radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); + radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4); +} + +static void +radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + + radeon_set_context_reg_seq(cmd_buffer->cs, + R_028430_DB_STENCILREFMASK, 2); + radeon_emit(cmd_buffer->cs, + S_028430_STENCILTESTVAL(d->stencil_reference.front) | + S_028430_STENCILMASK(d->stencil_compare_mask.front) | + S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | + S_028430_STENCILOPVAL(1)); + radeon_emit(cmd_buffer->cs, + S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | + S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | + S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | + S_028434_STENCILOPVAL_BF(1)); +} + +static void +radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + + radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, + fui(d->depth_bounds.min)); + radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX, + fui(d->depth_bounds.max)); +} + +static void +radv_emit_depth_biais(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_raster_state *raster = &cmd_buffer->state.pipeline->graphics.raster; + struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + unsigned slope = fui(d->depth_bias.slope * 16.0f); + unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale); + + if (G_028814_POLY_OFFSET_FRONT_ENABLE(raster->pa_su_sc_mode_cntl)) { + radeon_set_context_reg_seq(cmd_buffer->cs, + R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); + radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ + radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ + radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */ + radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ + radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */ + } +} + static void radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, struct radv_color_buffer_info *cb) { bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI; - radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); - radeon_emit(cmd_buffer->cs, cb->cb_color_base); - radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); - radeon_emit(cmd_buffer->cs, cb->cb_color_slice); - radeon_emit(cmd_buffer->cs, cb->cb_color_view); - radeon_emit(cmd_buffer->cs, cb->cb_color_info); - radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); - radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); - radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); - radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); - radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); - radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); - if (is_vi) { /* DCC BASE */ - radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); + radeon_emit(cmd_buffer->cs, cb->cb_color_base); + radeon_emit(cmd_buffer->cs, cb->cb_color_base >> 32); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2); + radeon_emit(cmd_buffer->cs, cb->cb_color_view); + radeon_emit(cmd_buffer->cs, cb->cb_color_info); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask >> 32); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask >> 32); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_base); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_base >> 32); + + radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, + cb->gfx9_epitch); + } else { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); + radeon_emit(cmd_buffer->cs, cb->cb_color_base); + radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); + radeon_emit(cmd_buffer->cs, cb->cb_color_slice); + radeon_emit(cmd_buffer->cs, cb->cb_color_view); + radeon_emit(cmd_buffer->cs, cb->cb_color_info); + radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); + radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); + radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); + + if (is_vi) { /* DCC BASE */ + radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); + } } } @@ -920,60 +1144,59 @@ radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout) { uint32_t db_z_info = ds->db_z_info; + uint32_t db_stencil_info = ds->db_stencil_info; - if (!radv_layout_has_htile(image, layout)) + if (!radv_layout_has_htile(image, layout, + radv_image_queue_family_mask(image, + cmd_buffer->queue_family_index, + cmd_buffer->queue_family_index))) { db_z_info &= C_028040_TILE_SURFACE_ENABLE; - - if (!radv_layout_can_expclear(image, layout)) - db_z_info &= C_028040_ALLOW_EXPCLEAR & C_028044_ALLOW_EXPCLEAR; + db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); + } radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); - radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); - - radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ - radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ - radeon_emit(cmd_buffer->cs, ds->db_stencil_info); /* R_028044_DB_STENCIL_INFO */ - radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ - radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ - radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ - radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface); - radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, - ds->pa_su_poly_offset_db_fmt_cntl); -} -/* - * To hw resolve multisample images both src and dst need to have the same - * micro tiling mode. However we don't always know in advance when creating - * the images. This function gets called if we have a resolve attachment, - * and tests if the attachment image has the same tiling mode, then it - * checks if the generated framebuffer data has the same tiling mode, and - * updates it if not. - */ -static void radv_set_optimal_micro_tile_mode(struct radv_device *device, - struct radv_attachment_info *att, - uint32_t micro_tile_mode) -{ - struct radv_image *image = att->attachment->image; - uint32_t tile_mode_index; - if (image->surface.nsamples <= 1) - return; - if (image->surface.micro_tile_mode != micro_tile_mode) { - radv_image_set_optimal_micro_tile_mode(device, image, micro_tile_mode); - } + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(cmd_buffer->cs, ds->db_htile_data_base); + radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32); + radeon_emit(cmd_buffer->cs, ds->db_depth_size); + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10); + radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2); + radeon_emit(cmd_buffer->cs, ds->db_z_info2); + radeon_emit(cmd_buffer->cs, ds->db_stencil_info2); + } else { + radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); - if (att->cb.micro_tile_mode != micro_tile_mode) { - tile_mode_index = image->surface.tiling_index[0]; + radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ + radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */ + radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ + radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ + radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ - att->cb.cb_color_attrib &= C_028C74_TILE_MODE_INDEX; - att->cb.cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); - att->cb.micro_tile_mode = micro_tile_mode; } + + radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + ds->pa_su_poly_offset_db_fmt_cntl); } void @@ -982,7 +1205,7 @@ radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; unsigned reg_offset = 0, reg_count = 0; @@ -1022,7 +1245,7 @@ static void radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; if (!image->surface.htile_size) @@ -1043,13 +1266,42 @@ radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, 0); } +/* + *with DCC some colors don't require CMASK elimiation before being + * used as a texture. This sets a predicate value to determine if the + * cmask eliminate is required. + */ +void +radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + bool value) +{ + uint64_t pred_val = value; + uint64_t va = radv_buffer_get_va(image->bo); + va += image->offset + image->dcc_pred_offset; + + if (!image->surface.dcc_size) + return; + + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit(cmd_buffer->cs, pred_val); + radeon_emit(cmd_buffer->cs, pred_val >> 32); +} + void radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int idx, uint32_t color_values[2]) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; if (!image->cmask.size && !image->surface.dcc_size) @@ -1076,7 +1328,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int idx) { - uint64_t va = cmd_buffer->device->ws->buffer_get_va(image->bo); + uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->clear_value_offset; if (!image->cmask.size && !image->surface.dcc_size) @@ -1085,7 +1337,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c; cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, image->bo, 8); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_COUNT_SEL); @@ -1094,7 +1346,7 @@ radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, reg >> 2); radeon_emit(cmd_buffer->cs, 0); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, 0); } @@ -1104,21 +1356,21 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) int i; struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; - int dst_resolve_micro_tile_mode = -1; - if (subpass->has_resolve) { - uint32_t a = subpass->resolve_attachments[0].attachment; - const struct radv_image *image = framebuffer->attachments[a].attachment->image; - dst_resolve_micro_tile_mode = image->surface.micro_tile_mode; - } - for (i = 0; i < subpass->color_count; ++i) { + /* this may happen for inherited secondary recording */ + if (!framebuffer) + return; + + for (i = 0; i < 8; ++i) { + if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { + radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + continue; + } + int idx = subpass->color_attachments[i].attachment; struct radv_attachment_info *att = &framebuffer->attachments[idx]; - if (dst_resolve_micro_tile_mode != -1) { - radv_set_optimal_micro_tile_mode(cmd_buffer->device, - att, dst_resolve_micro_tile_mode); - } cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8); assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT); @@ -1127,16 +1379,18 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) radv_load_color_clear_regs(cmd_buffer, att->attachment->image, i); } - for (i = subpass->color_count; i < 8; i++) - radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, - S_028C70_FORMAT(V_028C70_COLOR_INVALID)); - if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { int idx = subpass->depth_stencil_attachment.attachment; VkImageLayout layout = subpass->depth_stencil_attachment.layout; struct radv_attachment_info *att = &framebuffer->attachments[idx]; struct radv_image *image = att->attachment->image; cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, att->attachment->bo, 8); + MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image, + cmd_buffer->queue_family_index, + cmd_buffer->queue_family_index); + /* We currently don't support writing decompressed HTILE */ + assert(radv_layout_has_htile(image, layout, queue_mask) == + radv_layout_is_htile_compressed(image, layout, queue_mask)); radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout); @@ -1146,13 +1400,22 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) } radv_load_depth_clear_regs(cmd_buffer, image); } else { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); - radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* R_028040_DB_Z_INFO */ - radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* R_028044_DB_STENCIL_INFO */ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) + radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); + else + radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); + + radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ } radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height)); + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } } void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) @@ -1184,54 +1447,33 @@ void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) static void radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) { - struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + if (G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.raster.pa_cl_clip_cntl)) + return; - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) { - unsigned width = cmd_buffer->state.dynamic.line_width * 8; - radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(CLAMP(width, 0, 0xFFF))); - } + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) + radv_emit_viewport(cmd_buffer); - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cmd_buffer->cs, (uint32_t*)d->blend_constants, 4); - } + if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) + radv_emit_scissor(cmd_buffer); + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) + radv_emit_line_width(cmd_buffer); + + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) + radv_emit_blend_constants(cmd_buffer); if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | - RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) | - S_028430_STENCILMASK(d->stencil_compare_mask.front) | - S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | - S_028430_STENCILOPVAL(1)); - radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | - S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | - S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | - S_028434_STENCILOPVAL_BF(1)); - } + RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) + radv_emit_stencil(cmd_buffer); if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE | - RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)) { - radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, fui(d->depth_bounds.min)); - radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX, fui(d->depth_bounds.max)); - } + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)) + radv_emit_depth_bounds(cmd_buffer); if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE | - RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)) { - struct radv_raster_state *raster = &cmd_buffer->state.pipeline->graphics.raster; - unsigned slope = fui(d->depth_bias.slope * 16.0f); - unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale); - - if (G_028814_POLY_OFFSET_FRONT_ENABLE(raster->pa_su_sc_mode_cntl)) { - radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); - radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ - radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ - radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */ - radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ - radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */ - } - } + RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)) + radv_emit_depth_biais(cmd_buffer); cmd_buffer->state.dirty = 0; } @@ -1244,9 +1486,9 @@ emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer, gl_shader_stage stage) { struct ac_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx]; - uint32_t base_reg = shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); - if (desc_set_loc->sgpr_idx == -1) + if (desc_set_loc->sgpr_idx == -1 || desc_set_loc->indirect) return; assert(!desc_set_loc->indirect); @@ -1259,38 +1501,21 @@ emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer, static void radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer, - struct radv_pipeline *pipeline, VkShaderStageFlags stages, struct radv_descriptor_set *set, unsigned idx) { - if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_FRAGMENT); - - if (stages & VK_SHADER_STAGE_VERTEX_BIT) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_VERTEX); - - if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(pipeline)) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_GEOMETRY); - - if ((stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) && radv_pipeline_has_tess(pipeline)) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_TESS_CTRL); - - if ((stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) && radv_pipeline_has_tess(pipeline)) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, - idx, set->va, - MESA_SHADER_TESS_EVAL); + if (cmd_buffer->state.pipeline) { + radv_foreach_stage(stage, stages) { + if (cmd_buffer->state.pipeline->shaders[stage]) + emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline, + idx, set->va, + stage); + } + } - if (stages & VK_SHADER_STAGE_COMPUTE_BIT) - emit_stage_descriptor_set_userdata(cmd_buffer, pipeline, + if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT)) + emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.compute_pipeline, idx, set->va, MESA_SHADER_COMPUTE); } @@ -1299,43 +1524,102 @@ static void radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer) { struct radv_descriptor_set *set = &cmd_buffer->push_descriptors.set; - uint32_t *ptr = NULL; unsigned bo_offset; - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, set->size, 32, - &bo_offset, - (void**) &ptr)) + if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32, + set->mapped_ptr, + &bo_offset)) return; - set->va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); set->va += bo_offset; +} + +static void +radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer) +{ + uint32_t size = MAX_SETS * 2 * 4; + uint32_t offset; + void *ptr; + + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, + 256, &offset, &ptr)) + return; + + for (unsigned i = 0; i < MAX_SETS; i++) { + uint32_t *uptr = ((uint32_t *)ptr) + i * 2; + uint64_t set_va = 0; + struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i]; + if (set) + set_va = set->va; + uptr[0] = set_va & 0xffffffff; + uptr[1] = set_va >> 32; + } + + uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + va += offset; - memcpy(ptr, set->mapped_ptr, set->size); + if (cmd_buffer->state.pipeline) { + if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + + if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); + } + + if (cmd_buffer->state.compute_pipeline) + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE, + AC_UD_INDIRECT_DESCRIPTOR_SETS, va); } static void radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, - struct radv_pipeline *pipeline, VkShaderStageFlags stages) { unsigned i; + if (!cmd_buffer->state.descriptors_dirty) return; if (cmd_buffer->state.push_descriptors_dirty) radv_flush_push_descriptors(cmd_buffer); - for (i = 0; i < MAX_SETS; i++) { - if (!(cmd_buffer->state.descriptors_dirty & (1 << i))) - continue; + if ((cmd_buffer->state.pipeline && cmd_buffer->state.pipeline->need_indirect_descriptor_sets) || + (cmd_buffer->state.compute_pipeline && cmd_buffer->state.compute_pipeline->need_indirect_descriptor_sets)) { + radv_flush_indirect_descriptor_sets(cmd_buffer); + } + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, + MAX_SETS * MESA_SHADER_STAGES * 4); + + for_each_bit(i, cmd_buffer->state.descriptors_dirty) { struct radv_descriptor_set *set = cmd_buffer->state.descriptors[i]; if (!set) continue; - radv_emit_descriptor_set_userdata(cmd_buffer, pipeline, stages, set, i); + radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i); } cmd_buffer->state.descriptors_dirty = 0; cmd_buffer->state.push_descriptors_dirty = false; + + radv_save_descriptors(cmd_buffer); + + assert(cmd_buffer->cs->cdw <= cdw_max); } static void @@ -1361,34 +1645,21 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers, 16 * layout->dynamic_offset_count); - va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); va += offset; - if (stages & VK_SHADER_STAGE_VERTEX_BIT) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, - AC_UD_PUSH_CONSTANTS, va); - - if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, - AC_UD_PUSH_CONSTANTS, va); - - if ((stages & VK_SHADER_STAGE_GEOMETRY_BIT) && radv_pipeline_has_gs(pipeline)) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY, - AC_UD_PUSH_CONSTANTS, va); - - if ((stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) && radv_pipeline_has_tess(pipeline)) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL, - AC_UD_PUSH_CONSTANTS, va); - - if ((stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) && radv_pipeline_has_tess(pipeline)) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL, - AC_UD_PUSH_CONSTANTS, va); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, MESA_SHADER_STAGES * 4); - if (stages & VK_SHADER_STAGE_COMPUTE_BIT) - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE, - AC_UD_PUSH_CONSTANTS, va); + radv_foreach_stage(stage, stages) { + if (pipeline->shaders[stage]) { + radv_emit_userdata_address(cmd_buffer, pipeline, stage, + AC_UD_PUSH_CONSTANTS, va); + } + } cmd_buffer->push_constant_stages &= ~stages; + assert(cmd_buffer->cs->cdw <= cdw_max); } static void radv_emit_primitive_reset_state(struct radv_cmd_buffer *cmd_buffer, @@ -1398,8 +1669,13 @@ static void radv_emit_primitive_reset_state(struct radv_cmd_buffer *cmd_buffer, if (primitive_reset_en != cmd_buffer->state.last_primitive_reset_en) { cmd_buffer->state.last_primitive_reset_en = primitive_reset_en; - radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, - primitive_reset_en); + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, + primitive_reset_en); + } else { + radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, + primitive_reset_en); + } } if (primitive_reset_en) { @@ -1413,99 +1689,94 @@ static void radv_emit_primitive_reset_state(struct radv_cmd_buffer *cmd_buffer, } } -static void -radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer, - bool indexed_draw, bool instanced_draw, - bool indirect_draw, - uint32_t draw_vertex_count) +static bool +radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer) { - struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; struct radv_device *device = cmd_buffer->device; - uint32_t ia_multi_vgt_param; - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, 4096); - if ((cmd_buffer->state.vertex_descriptors_dirty || cmd_buffer->state.vb_dirty) && - cmd_buffer->state.pipeline->num_vertex_attribs) { + if ((cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline || cmd_buffer->state.vb_dirty) && + cmd_buffer->state.pipeline->vertex_elements.count && + cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.has_vertex_buffers) { + struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements; unsigned vb_offset; void *vb_ptr; uint32_t i = 0; - uint32_t num_attribs = cmd_buffer->state.pipeline->num_vertex_attribs; + uint32_t count = velems->count; uint64_t va; /* allocate some descriptor state for vertex buffers */ - radv_cmd_buffer_upload_alloc(cmd_buffer, num_attribs * 16, 256, - &vb_offset, &vb_ptr); + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256, + &vb_offset, &vb_ptr)) + return false; - for (i = 0; i < num_attribs; i++) { + for (i = 0; i < count; i++) { uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4]; uint32_t offset; - int vb = cmd_buffer->state.pipeline->va_binding[i]; + int vb = velems->binding[i]; struct radv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb]; device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8); - va = device->ws->buffer_get_va(buffer->bo); + va = radv_buffer_get_va(buffer->bo); - offset = cmd_buffer->state.vertex_bindings[vb].offset + cmd_buffer->state.pipeline->va_offset[i]; + offset = cmd_buffer->state.vertex_bindings[vb].offset + velems->offset[i]; va += offset + buffer->offset; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride) - desc[2] = (buffer->size - offset - cmd_buffer->state.pipeline->va_format_size[i]) / stride + 1; + desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1; else desc[2] = buffer->size - offset; - desc[3] = cmd_buffer->state.pipeline->va_rsrc_word3[i]; + desc[3] = velems->rsrc_word3[i]; } - va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); va += vb_offset; - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, + radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va); } + cmd_buffer->state.vb_dirty = false; + + return true; +} + +static void +radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer, + bool indexed_draw, bool instanced_draw, + bool indirect_draw, + uint32_t draw_vertex_count) +{ + uint32_t ia_multi_vgt_param; + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, 4096); + + if (!radv_cmd_buffer_update_vertex_descriptors(cmd_buffer)) + return; - cmd_buffer->state.vertex_descriptors_dirty = false; - cmd_buffer->state.vb_dirty = 0; if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) - radv_emit_graphics_pipeline(cmd_buffer, pipeline); + radv_emit_graphics_pipeline(cmd_buffer); if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RENDER_TARGETS) radv_emit_framebuffer_state(cmd_buffer); - if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) - radv_emit_viewport(cmd_buffer); - - if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) - radv_emit_scissor(cmd_buffer); - ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, draw_vertex_count); if (cmd_buffer->state.last_ia_multi_vgt_param != ia_multi_vgt_param) { - if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); + else if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) radeon_set_context_reg_idx(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); else radeon_set_context_reg(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); cmd_buffer->state.last_ia_multi_vgt_param = ia_multi_vgt_param; } - if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) { - radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, pipeline->graphics.vgt_shader_stages_en); - - if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { - radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, cmd_buffer->state.pipeline->graphics.prim); - } else { - radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, cmd_buffer->state.pipeline->graphics.prim); - } - radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, cmd_buffer->state.pipeline->graphics.gs_out); - } - radv_cmd_buffer_flush_dynamic_state(cmd_buffer); radv_emit_primitive_reset_state(cmd_buffer, indexed_draw); - radv_flush_descriptors(cmd_buffer, cmd_buffer->state.pipeline, - VK_SHADER_STAGE_ALL_GRAPHICS); + radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline, VK_SHADER_STAGE_ALL_GRAPHICS); @@ -1657,8 +1928,9 @@ radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, radv_subpass_barrier(cmd_buffer, &subpass->start_barrier); for (unsigned i = 0; i < subpass->color_count; ++i) { - radv_handle_subpass_image_transition(cmd_buffer, - subpass->color_attachments[i]); + if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) + radv_handle_subpass_image_transition(cmd_buffer, + subpass->color_attachments[i]); } for (unsigned i = 0; i < subpass->input_count; ++i) { @@ -1677,7 +1949,7 @@ radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RENDER_TARGETS; } -static void +static VkResult radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass, const VkRenderPassBeginInfo *info) @@ -1686,7 +1958,7 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, if (pass->attachment_count == 0) { state->attachments = NULL; - return; + return VK_SUCCESS; } state->attachments = vk_alloc(&cmd_buffer->pool->alloc, @@ -1694,8 +1966,8 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, sizeof(state->attachments[0]), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (state->attachments == NULL) { - /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ - abort(); + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return cmd_buffer->record_result; } for (uint32_t i = 0; i < pass->attachment_count; ++i) { @@ -1713,6 +1985,9 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; } if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { @@ -1721,6 +1996,7 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, } state->attachments[i].pending_clear_aspects = clear_aspects; + state->attachments[i].cleared_views = 0; if (clear_aspects && info) { assert(info->clearValueCount > i); state->attachments[i].clear_value = info->pClearValues[i]; @@ -1728,6 +2004,8 @@ radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, state->attachments[i].current_layout = att->initial_layout; } + + return VK_SUCCESS; } VkResult radv_AllocateCommandBuffers( @@ -1752,12 +2030,11 @@ VkResult radv_AllocateCommandBuffers( list_del(&cmd_buffer->pool_link); list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); - radv_reset_cmd_buffer(cmd_buffer); + result = radv_reset_cmd_buffer(cmd_buffer); cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC; cmd_buffer->level = pAllocateInfo->level; pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer); - result = VK_SUCCESS; } else { result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]); @@ -1798,19 +2075,18 @@ VkResult radv_ResetCommandBuffer( VkCommandBufferResetFlags flags) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_reset_cmd_buffer(cmd_buffer); - return VK_SUCCESS; + return radv_reset_cmd_buffer(cmd_buffer); } static void emit_gfx_buffer_state(struct radv_cmd_buffer *cmd_buffer) { struct radv_device *device = cmd_buffer->device; if (device->gfx_init) { - uint64_t va = device->ws->buffer_get_va(device->gfx_init); + uint64_t va = radv_buffer_get_va(device->gfx_init); device->ws->cs_add_buffer(cmd_buffer->cs, device->gfx_init, 8); radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, (va >> 32) & 0xffff); + radeon_emit(cmd_buffer->cs, va >> 32); radeon_emit(cmd_buffer->cs, device->gfx_init_size_dw & 0xffff); } else si_init_config(cmd_buffer); @@ -1821,10 +2097,15 @@ VkResult radv_BeginCommandBuffer( const VkCommandBufferBeginInfo *pBeginInfo) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_reset_cmd_buffer(cmd_buffer); + VkResult result; + + result = radv_reset_cmd_buffer(cmd_buffer); + if (result != VK_SUCCESS) + return result; memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); cmd_buffer->state.last_primitive_reset_en = -1; + cmd_buffer->usage_flags = pBeginInfo->flags; /* setup initial configuration into command buffer */ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { @@ -1843,18 +2124,22 @@ VkResult radv_BeginCommandBuffer( } if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + assert(pBeginInfo->pInheritanceInfo); cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); struct radv_subpass *subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; - radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL); + result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL); + if (result != VK_SUCCESS) + return result; + radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false); } radv_cmd_buffer_trace_emit(cmd_buffer); - return VK_SUCCESS; + return result; } void radv_CmdBindVertexBuffers( @@ -1870,12 +2155,13 @@ void radv_CmdBindVertexBuffers( /* We have to defer setting up vertex buffer since we need the buffer * stride from the pipeline. */ - assert(firstBinding + bindingCount < MAX_VBS); + assert(firstBinding + bindingCount <= MAX_VBS); for (uint32_t i = 0; i < bindingCount; i++) { vb[firstBinding + i].buffer = radv_buffer_from_handle(pBuffers[i]); vb[firstBinding + i].offset = pOffsets[i]; - cmd_buffer->state.vb_dirty |= 1 << (firstBinding + i); } + + cmd_buffer->state.vb_dirty = true; } void radv_CmdBindIndexBuffer( @@ -1885,12 +2171,16 @@ void radv_CmdBindIndexBuffer( VkIndexType indexType) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer); - cmd_buffer->state.index_buffer = radv_buffer_from_handle(buffer); - cmd_buffer->state.index_offset = offset; cmd_buffer->state.index_type = indexType; /* vk matches hw */ + cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo); + cmd_buffer->state.index_va += index_buffer->offset + offset; + + int index_size_shift = cmd_buffer->state.index_type ? 2 : 1; + cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; - cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->state.index_buffer->bo, 8); + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, index_buffer->bo, 8); } @@ -1901,10 +2191,12 @@ void radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys *ws = cmd_buffer->device->ws; cmd_buffer->state.descriptors[idx] = set; - cmd_buffer->state.descriptors_dirty |= (1 << idx); + cmd_buffer->state.descriptors_dirty |= (1u << idx); if (!set) return; + assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + for (unsigned j = 0; j < set->layout->buffer_count; ++j) if (set->descriptors[j]) ws->cs_add_buffer(cmd_buffer->cs, set->descriptors[j], 7); @@ -1971,7 +2263,7 @@ static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, if (!set->mapped_ptr) { cmd_buffer->push_descriptors.capacity = 0; - cmd_buffer->record_fail = true; + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; return false; } @@ -1981,6 +2273,40 @@ static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, return true; } +void radv_meta_push_descriptor_set( + struct radv_cmd_buffer* cmd_buffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites) +{ + RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); + struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors; + unsigned bo_offset; + + assert(set == 0); + assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); + + push_set->size = layout->set[set].layout->size; + push_set->layout = layout->set[set].layout; + + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32, + &bo_offset, + (void**) &push_set->mapped_ptr)) + return; + + push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + push_set->va += bo_offset; + + radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer, + radv_descriptor_set_to_handle(push_set), + descriptorWriteCount, pDescriptorWrites, 0, NULL); + + cmd_buffer->state.descriptors[set] = push_set; + cmd_buffer->state.descriptors_dirty |= (1u << set); +} + void radv_CmdPushDescriptorSetKHR( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -2003,7 +2329,7 @@ void radv_CmdPushDescriptorSetKHR( descriptorWriteCount, pDescriptorWrites, 0, NULL); cmd_buffer->state.descriptors[set] = push_set; - cmd_buffer->state.descriptors_dirty |= (1 << set); + cmd_buffer->state.descriptors_dirty |= (1u << set); cmd_buffer->state.push_descriptors_dirty = true; } @@ -2027,7 +2353,7 @@ void radv_CmdPushDescriptorSetWithTemplateKHR( descriptorUpdateTemplate, pData); cmd_buffer->state.descriptors[set] = push_set; - cmd_buffer->state.descriptors_dirty |= (1 << set); + cmd_buffer->state.descriptors_dirty |= (1u << set); cmd_buffer->state.push_descriptors_dirty = true; } @@ -2048,13 +2374,16 @@ VkResult radv_EndCommandBuffer( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) + if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) { + if (cmd_buffer->device->physical_device->rad_info.chip_class == SI) + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; si_emit_cache_flush(cmd_buffer); + } - if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) || - cmd_buffer->record_fail) + if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs)) return VK_ERROR_OUT_OF_DEVICE_MEMORY; - return VK_SUCCESS; + + return cmd_buffer->record_result; } static void @@ -2071,9 +2400,10 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.emitted_compute_pipeline = pipeline; compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; - va = ws->buffer_get_va(compute_shader->bo); + va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset; ws->cs_add_buffer(cmd_buffer->cs, compute_shader->bo, 8); + radv_emit_prefetch(cmd_buffer, va, compute_shader->code_size); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 16); @@ -2105,8 +2435,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2])); assert(cmd_buffer->cs->cdw <= cdw_max); + radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE); } +static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer) +{ + for (unsigned i = 0; i < MAX_SETS; i++) { + if (cmd_buffer->state.descriptors[i]) + cmd_buffer->state.descriptors_dirty |= (1u << i); + } +} void radv_CmdBindPipeline( VkCommandBuffer commandBuffer, @@ -2116,10 +2454,7 @@ void radv_CmdBindPipeline( RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); - for (unsigned i = 0; i < MAX_SETS; i++) { - if (cmd_buffer->state.descriptors[i]) - cmd_buffer->state.descriptors_dirty |= (1 << i); - } + radv_mark_descriptor_sets_dirty(cmd_buffer); switch (pipelineBindPoint) { case VK_PIPELINE_BIND_POINT_COMPUTE: @@ -2128,7 +2463,9 @@ void radv_CmdBindPipeline( break; case VK_PIPELINE_BIND_POINT_GRAPHICS: cmd_buffer->state.pipeline = pipeline; - cmd_buffer->state.vertex_descriptors_dirty = true; + if (!pipeline) + break; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE; cmd_buffer->push_constant_stages |= pipeline->active_stages; @@ -2168,10 +2505,10 @@ void radv_CmdSetViewport( const VkViewport* pViewports) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - const uint32_t total_count = firstViewport + viewportCount; - if (cmd_buffer->state.dynamic.viewport.count < total_count) - cmd_buffer->state.dynamic.viewport.count = total_count; + + assert(firstViewport < MAX_VIEWPORTS); + assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); memcpy(cmd_buffer->state.dynamic.viewport.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports)); @@ -2186,10 +2523,10 @@ void radv_CmdSetScissor( const VkRect2D* pScissors) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - const uint32_t total_count = firstScissor + scissorCount; - if (cmd_buffer->state.dynamic.scissor.count < total_count) - cmd_buffer->state.dynamic.scissor.count = total_count; + + assert(firstScissor < MAX_SCISSORS); + assert(total_count >= 1 && total_count <= MAX_SCISSORS); memcpy(cmd_buffer->state.dynamic.scissor.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors)); @@ -2290,7 +2627,6 @@ void radv_CmdSetStencilReference( cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE; } - void radv_CmdExecuteCommands( VkCommandBuffer commandBuffer, uint32_t commandBufferCount, @@ -2325,16 +2661,18 @@ void radv_CmdExecuteCommands( assert(secondary->ring_offsets_idx == primary->ring_offsets_idx); } primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs); + + primary->state.emitted_pipeline = secondary->state.emitted_pipeline; + primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline; + primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en; + primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index; } - /* if we execute secondary we need to re-emit out pipelines */ + /* if we execute secondary we need to mark some stuff to reset dirty */ if (commandBufferCount) { - primary->state.emitted_pipeline = NULL; - primary->state.emitted_compute_pipeline = NULL; primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE; primary->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL; - primary->state.last_primitive_reset_en = -1; - primary->state.last_primitive_reset_index = 0; + radv_mark_descriptor_sets_dirty(primary); } } @@ -2398,10 +2736,13 @@ VkResult radv_ResetCommandPool( VkCommandPoolResetFlags flags) { RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); + VkResult result; list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) { - radv_reset_cmd_buffer(cmd_buffer); + result = radv_reset_cmd_buffer(cmd_buffer); + if (result != VK_SUCCESS) + return result; } return VK_SUCCESS; @@ -2434,11 +2775,15 @@ void radv_CmdBeginRenderPass( MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2048); + MAYBE_UNUSED VkResult result; cmd_buffer->state.framebuffer = framebuffer; cmd_buffer->state.pass = pass; cmd_buffer->state.render_area = pRenderPassBegin->renderArea; - radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin); + + result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin); + if (result != VK_SUCCESS) + return; radv_cmd_buffer_set_subpass(cmd_buffer, pass->subpasses, true); assert(cmd_buffer->cs->cdw <= cdw_max); @@ -2461,6 +2806,38 @@ void radv_CmdNextSubpass( radv_cmd_buffer_clear_subpass(cmd_buffer); } +static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { + if (!pipeline->shaders[stage]) + continue; + struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX); + if (loc->sgpr_idx == -1) + continue; + uint32_t base_reg = radv_shader_stage_to_user_data_0(stage, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); + + } + if (pipeline->gs_copy_shader) { + struct ac_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX]; + if (loc->sgpr_idx != -1) { + uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); + } + } +} + +static void +radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, + uint32_t vertex_count) +{ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating)); + radeon_emit(cmd_buffer->cs, vertex_count); + radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | + S_0287F0_USE_OPAQUE(0)); +} + void radv_CmdDraw( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -2472,29 +2849,47 @@ void radv_CmdDraw( radv_cmd_buffer_flush_state(cmd_buffer, false, (instanceCount > 1), false, vertexCount); - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 20 * MAX_VIEWS); - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, - AC_UD_VS_BASE_VERTEX_START_INSTANCE); - if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline), - radv_pipeline_has_tess(cmd_buffer->state.pipeline)); - radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, firstVertex); - radeon_emit(cmd_buffer->cs, firstInstance); + assert(cmd_buffer->state.pipeline->graphics.vtx_base_sgpr); + radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.pipeline->graphics.vtx_base_sgpr, + cmd_buffer->state.pipeline->graphics.vtx_emit_num); + radeon_emit(cmd_buffer->cs, firstVertex); + radeon_emit(cmd_buffer->cs, firstInstance); + if (cmd_buffer->state.pipeline->graphics.vtx_emit_num == 3) radeon_emit(cmd_buffer->cs, 0); - } - radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, instanceCount); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0)); - radeon_emit(cmd_buffer->cs, vertexCount); - radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | - S_0287F0_USE_OPAQUE(0)); + if (!cmd_buffer->state.subpass->view_mask) { + radv_cs_emit_draw_packet(cmd_buffer, vertexCount); + } else { + unsigned i; + for_each_bit(i, cmd_buffer->state.subpass->view_mask) { + radv_emit_view_index(cmd_buffer, i); + + radv_cs_emit_draw_packet(cmd_buffer, vertexCount); + } + } assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_cmd_buffer_after_draw(cmd_buffer); +} + + +static void +radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, + uint64_t index_va, + uint32_t index_count) +{ + radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false)); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count); + radeon_emit(cmd_buffer->cs, index_va); + radeon_emit(cmd_buffer->cs, (index_va >> 32UL) & 0xFF); + radeon_emit(cmd_buffer->cs, index_count); + radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA); } void radv_CmdDrawIndexed( @@ -2507,40 +2902,85 @@ void radv_CmdDrawIndexed( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); int index_size = cmd_buffer->state.index_type ? 4 : 2; - uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; uint64_t index_va; radv_cmd_buffer_flush_state(cmd_buffer, true, (instanceCount > 1), false, indexCount); - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 26 * MAX_VIEWS); - radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_03090C_VGT_INDEX_TYPE, + 2, cmd_buffer->state.index_type); + } else { + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); + } - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, - AC_UD_VS_BASE_VERTEX_START_INSTANCE); - if (loc->sgpr_idx != -1) { - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline), - radv_pipeline_has_tess(cmd_buffer->state.pipeline)); - radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, vertexOffset); - radeon_emit(cmd_buffer->cs, firstInstance); + assert(cmd_buffer->state.pipeline->graphics.vtx_base_sgpr); + radeon_set_sh_reg_seq(cmd_buffer->cs, cmd_buffer->state.pipeline->graphics.vtx_base_sgpr, + cmd_buffer->state.pipeline->graphics.vtx_emit_num); + radeon_emit(cmd_buffer->cs, vertexOffset); + radeon_emit(cmd_buffer->cs, firstInstance); + if (cmd_buffer->state.pipeline->graphics.vtx_emit_num == 3) radeon_emit(cmd_buffer->cs, 0); - } + radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); radeon_emit(cmd_buffer->cs, instanceCount); - index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); - index_va += firstIndex * index_size + cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset; - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false)); - radeon_emit(cmd_buffer->cs, index_max_size); - radeon_emit(cmd_buffer->cs, index_va); - radeon_emit(cmd_buffer->cs, (index_va >> 32UL) & 0xFF); - radeon_emit(cmd_buffer->cs, indexCount); - radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA); + index_va = cmd_buffer->state.index_va; + index_va += firstIndex * index_size; + if (!cmd_buffer->state.subpass->view_mask) { + radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, indexCount); + } else { + unsigned i; + for_each_bit(i, cmd_buffer->state.subpass->view_mask) { + radv_emit_view_index(cmd_buffer, i); + + radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, indexCount); + } + } assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_cmd_buffer_after_draw(cmd_buffer); +} + +static void +radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, + bool indexed, + uint32_t draw_count, + uint64_t count_va, + uint32_t stride) +{ + struct radeon_winsys_cs *cs = cmd_buffer->cs; + unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA + : V_0287F0_DI_SRC_SEL_AUTO_INDEX; + bool draw_id_enable = cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.needs_draw_id; + uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr; + assert(base_reg); + + if (draw_count == 1 && !count_va && !draw_id_enable) { + radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : + PKT3_DRAW_INDIRECT, 3, false)); + radeon_emit(cs, 0); + radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, di_src_sel); + } else { + radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : + PKT3_DRAW_INDIRECT_MULTI, + 8, false)); + radeon_emit(cs, 0); + radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) | + S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | + S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); + radeon_emit(cs, draw_count); /* count */ + radeon_emit(cs, count_va); /* count_addr */ + radeon_emit(cs, count_va >> 32); + radeon_emit(cs, stride); /* stride */ + radeon_emit(cs, di_src_sel); + } } static void @@ -2556,14 +2996,13 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer, RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); RADV_FROM_HANDLE(radv_buffer, count_buffer, _count_buffer); struct radeon_winsys_cs *cs = cmd_buffer->cs; - unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA - : V_0287F0_DI_SRC_SEL_AUTO_INDEX; - uint64_t indirect_va = cmd_buffer->device->ws->buffer_get_va(buffer->bo); + + uint64_t indirect_va = radv_buffer_get_va(buffer->bo); indirect_va += offset + buffer->offset; uint64_t count_va = 0; if (count_buffer) { - count_va = cmd_buffer->device->ws->buffer_get_va(count_buffer->bo); + count_va = radv_buffer_get_va(count_buffer->bo); count_va += count_offset + count_buffer->offset; } @@ -2572,31 +3011,22 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->device->ws->cs_add_buffer(cs, buffer->bo, 8); - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX, - AC_UD_VS_BASE_VERTEX_START_INSTANCE); - uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_VERTEX, radv_pipeline_has_gs(cmd_buffer->state.pipeline), - radv_pipeline_has_tess(cmd_buffer->state.pipeline)); - assert(loc->sgpr_idx != -1); radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); radeon_emit(cs, 1); radeon_emit(cs, indirect_va); radeon_emit(cs, indirect_va >> 32); - radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : - PKT3_DRAW_INDIRECT_MULTI, - 8, false)); - radeon_emit(cs, 0); - radeon_emit(cs, ((base_reg + loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, ((base_reg + (loc->sgpr_idx + 1) * 4) - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (((base_reg + (loc->sgpr_idx + 2) * 4) - SI_SH_REG_OFFSET) >> 2) | - S_2C3_DRAW_INDEX_ENABLE(1) | - S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); - radeon_emit(cs, draw_count); /* count */ - radeon_emit(cs, count_va); /* count_addr */ - radeon_emit(cs, count_va >> 32); - radeon_emit(cs, stride); /* stride */ - radeon_emit(cs, di_src_sel); - radv_cmd_buffer_trace_emit(cmd_buffer); + if (!cmd_buffer->state.subpass->view_mask) { + radv_cs_emit_indirect_draw_packet(cmd_buffer, indexed, draw_count, count_va, stride); + } else { + unsigned i; + for_each_bit(i, cmd_buffer->state.subpass->view_mask) { + radv_emit_view_index(cmd_buffer, i); + + radv_cs_emit_indirect_draw_packet(cmd_buffer, indexed, draw_count, count_va, stride); + } + } + radv_cmd_buffer_after_draw(cmd_buffer); } static void @@ -2612,7 +3042,7 @@ radv_cmd_draw_indirect_count(VkCommandBuffer command radv_cmd_buffer_flush_state(cmd_buffer, false, false, true, 0); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, 14); + cmd_buffer->cs, 24 * MAX_VIEWS); radv_emit_indirect_draw(cmd_buffer, buffer, offset, countBuffer, countBufferOffset, maxDrawCount, stride, false); @@ -2631,15 +3061,12 @@ radv_cmd_draw_indexed_indirect_count( uint32_t stride) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - int index_size = cmd_buffer->state.index_type ? 4 : 2; - uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; uint64_t index_va; radv_cmd_buffer_flush_state(cmd_buffer, true, false, true, 0); - index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); - index_va += cmd_buffer->state.index_buffer->offset + cmd_buffer->state.index_offset; + index_va = cmd_buffer->state.index_va; - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 21); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 31 * MAX_VIEWS); radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type); @@ -2649,7 +3076,7 @@ radv_cmd_draw_indexed_indirect_count( radeon_emit(cmd_buffer->cs, index_va >> 32); radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); - radeon_emit(cmd_buffer->cs, index_max_size); + radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count); radv_emit_indirect_draw(cmd_buffer, buffer, offset, countBuffer, countBufferOffset, maxDrawCount, stride, true); @@ -2707,15 +3134,155 @@ void radv_CmdDrawIndexedIndirectCountAMD( maxDrawCount, stride); } +struct radv_dispatch_info { + /** + * Determine the layout of the grid (in block units) to be used. + */ + uint32_t blocks[3]; + + /** + * Whether it's an unaligned compute dispatch. + */ + bool unaligned; + + /** + * Indirect compute parameters resource. + */ + struct radv_buffer *indirect; + uint64_t indirect_offset; +}; + +static void +radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, + const struct radv_dispatch_info *info) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; + struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; + struct radeon_winsys *ws = cmd_buffer->device->ws; + struct radeon_winsys_cs *cs = cmd_buffer->cs; + struct ac_userdata_info *loc; + uint8_t grid_used; + + grid_used = compute_shader->info.info.cs.grid_components_used; + + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, + AC_UD_CS_GRID_SIZE); + + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25); + + if (info->indirect) { + uint64_t va = radv_buffer_get_va(info->indirect->bo); + + va += info->indirect->offset + info->indirect_offset; + + ws->cs_add_buffer(cs, info->indirect->bo, 8); + + if (loc->sgpr_idx != -1) { + for (unsigned i = 0; i < grid_used; ++i) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG)); + radeon_emit(cs, (va + 4 * i)); + radeon_emit(cs, (va + 4 * i) >> 32); + radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + + loc->sgpr_idx * 4) >> 2) + i); + radeon_emit(cs, 0); + } + } + + if (radv_cmd_buffer_uses_mec(cmd_buffer)) { + radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 1); + } else { + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, 1); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, 0); + radeon_emit(cs, 1); + } + } else { + unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] }; + unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1); + + if (info->unaligned) { + unsigned *cs_block_size = compute_shader->info.cs.block_size; + unsigned remainder[3]; + + /* If aligned, these should be an entire block size, + * not 0. + */ + remainder[0] = blocks[0] + cs_block_size[0] - + align_u32_npot(blocks[0], cs_block_size[0]); + remainder[1] = blocks[1] + cs_block_size[1] - + align_u32_npot(blocks[1], cs_block_size[1]); + remainder[2] = blocks[2] + cs_block_size[2] - + align_u32_npot(blocks[2], cs_block_size[2]); + + blocks[0] = round_up_u32(blocks[0], cs_block_size[0]); + blocks[1] = round_up_u32(blocks[1], cs_block_size[1]); + blocks[2] = round_up_u32(blocks[2], cs_block_size[2]); + + radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_emit(cs, + S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); + radeon_emit(cs, + S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); + radeon_emit(cs, + S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | + S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); + + dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); + } + + if (loc->sgpr_idx != -1) { + assert(!loc->indirect); + assert(loc->num_sgprs == grid_used); + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + loc->sgpr_idx * 4, grid_used); + radeon_emit(cs, blocks[0]); + if (grid_used > 1) + radeon_emit(cs, blocks[1]); + if (grid_used > 2) + radeon_emit(cs, blocks[2]); + } + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, blocks[0]); + radeon_emit(cs, blocks[1]); + radeon_emit(cs, blocks[2]); + radeon_emit(cs, dispatch_initiator); + } + + assert(cmd_buffer->cs->cdw <= cdw_max); +} + static void -radv_flush_compute_state(struct radv_cmd_buffer *cmd_buffer) +radv_dispatch(struct radv_cmd_buffer *cmd_buffer, + const struct radv_dispatch_info *info) { radv_emit_compute_pipeline(cmd_buffer); - radv_flush_descriptors(cmd_buffer, cmd_buffer->state.compute_pipeline, - VK_SHADER_STAGE_COMPUTE_BIT); + + radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT); radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline, VK_SHADER_STAGE_COMPUTE_BIT); + si_emit_cache_flush(cmd_buffer); + + radv_emit_dispatch_packets(cmd_buffer, info); + + radv_cmd_buffer_after_draw(cmd_buffer); } void radv_CmdDispatch( @@ -2725,31 +3292,13 @@ void radv_CmdDispatch( uint32_t z) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_dispatch_info info = {}; - radv_flush_compute_state(cmd_buffer); - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10); - - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline, - MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); - if (loc->sgpr_idx != -1) { - assert(!loc->indirect); - assert(loc->num_sgprs == 3); - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, x); - radeon_emit(cmd_buffer->cs, y); - radeon_emit(cmd_buffer->cs, z); - } - - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, x); - radeon_emit(cmd_buffer->cs, y); - radeon_emit(cmd_buffer->cs, z); - radeon_emit(cmd_buffer->cs, 1); + info.blocks[0] = x; + info.blocks[1] = y; + info.blocks[2] = z; - assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_dispatch(cmd_buffer, &info); } void radv_CmdDispatchIndirect( @@ -2759,49 +3308,12 @@ void radv_CmdDispatchIndirect( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); - uint64_t va = cmd_buffer->device->ws->buffer_get_va(buffer->bo); - va += buffer->offset + offset; - - cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, buffer->bo, 8); - - radv_flush_compute_state(cmd_buffer); - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25); - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline, - MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); - if (loc->sgpr_idx != -1) { - for (unsigned i = 0; i < 3; ++i) { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | - COPY_DATA_DST_SEL(COPY_DATA_REG)); - radeon_emit(cmd_buffer->cs, (va + 4 * i)); - radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32); - radeon_emit(cmd_buffer->cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i); - radeon_emit(cmd_buffer->cs, 0); - } - } - - if (radv_cmd_buffer_uses_mec(cmd_buffer)) { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, va >> 32); - radeon_emit(cmd_buffer->cs, 1); - } else { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, 1); - radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, va >> 32); + struct radv_dispatch_info info = {}; - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, 0); - radeon_emit(cmd_buffer->cs, 1); - } + info.indirect = buffer; + info.indirect_offset = offset; - assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_dispatch(cmd_buffer, &info); } void radv_unaligned_dispatch( @@ -2810,52 +3322,14 @@ void radv_unaligned_dispatch( uint32_t y, uint32_t z) { - struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; - struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; - uint32_t blocks[3], remainder[3]; - - blocks[0] = round_up_u32(x, compute_shader->info.cs.block_size[0]); - blocks[1] = round_up_u32(y, compute_shader->info.cs.block_size[1]); - blocks[2] = round_up_u32(z, compute_shader->info.cs.block_size[2]); - - /* If aligned, these should be an entire block size, not 0 */ - remainder[0] = x + compute_shader->info.cs.block_size[0] - align_u32_npot(x, compute_shader->info.cs.block_size[0]); - remainder[1] = y + compute_shader->info.cs.block_size[1] - align_u32_npot(y, compute_shader->info.cs.block_size[1]); - remainder[2] = z + compute_shader->info.cs.block_size[2] - align_u32_npot(z, compute_shader->info.cs.block_size[2]); - - radv_flush_compute_state(cmd_buffer); - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15); - - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]) | - S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]) | - S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]) | - S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); + struct radv_dispatch_info info = {}; - struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline, - MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); - if (loc->sgpr_idx != -1) { - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); - radeon_emit(cmd_buffer->cs, blocks[0]); - radeon_emit(cmd_buffer->cs, blocks[1]); - radeon_emit(cmd_buffer->cs, blocks[2]); - } - radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cmd_buffer->cs, blocks[0]); - radeon_emit(cmd_buffer->cs, blocks[1]); - radeon_emit(cmd_buffer->cs, blocks[2]); - radeon_emit(cmd_buffer->cs, S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_PARTIAL_TG_EN(1)); + info.blocks[0] = x; + info.blocks[1] = y; + info.blocks[2] = z; + info.unaligned = 1; - assert(cmd_buffer->cs->cdw <= cdw_max); - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_dispatch(cmd_buffer, &info); } void radv_CmdEndRenderPass( @@ -2881,10 +3355,16 @@ void radv_CmdEndRenderPass( cmd_buffer->state.framebuffer = NULL; } - +/* + * For HTILE we have the following interesting clear words: + * 0x0000030f: Uncompressed. + * 0xfffffff0: Clear depth to 1.0 + * 0x00000000: Clear depth to 0.0 + */ static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - const VkImageSubresourceRange *range) + const VkImageSubresourceRange *range, + uint32_t clear_word) { assert(range->baseMipLevel == 0); assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS); @@ -2896,7 +3376,7 @@ static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; - radv_fill_buffer(cmd_buffer, image->bo, offset, size, 0xffffffff); + radv_fill_buffer(cmd_buffer, image->bo, offset, size, clear_word); cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | @@ -2908,27 +3388,27 @@ static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffe struct radv_image *image, VkImageLayout src_layout, VkImageLayout dst_layout, + unsigned src_queue_mask, + unsigned dst_queue_mask, const VkImageSubresourceRange *range, VkImageAspectFlags pending_clears) { if (dst_layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL && (pending_clears & vk_format_aspects(image->vk_format)) == vk_format_aspects(image->vk_format) && cmd_buffer->state.render_area.offset.x == 0 && cmd_buffer->state.render_area.offset.y == 0 && - cmd_buffer->state.render_area.extent.width == image->extent.width && - cmd_buffer->state.render_area.extent.height == image->extent.height) { + cmd_buffer->state.render_area.extent.width == image->info.width && + cmd_buffer->state.render_area.extent.height == image->info.height) { /* The clear will initialize htile. */ return; } else if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED && - radv_layout_has_htile(image, dst_layout)) { + radv_layout_has_htile(image, dst_layout, dst_queue_mask)) { /* TODO: merge with the clear if applicable */ - radv_initialize_htile(cmd_buffer, image, range); - } else if (!radv_layout_has_htile(image, src_layout) && - radv_layout_has_htile(image, dst_layout)) { - radv_initialize_htile(cmd_buffer, image, range); - } else if ((radv_layout_has_htile(image, src_layout) && - !radv_layout_has_htile(image, dst_layout)) || - (radv_layout_is_htile_compressed(image, src_layout) && - !radv_layout_is_htile_compressed(image, dst_layout))) { + radv_initialize_htile(cmd_buffer, image, range, 0); + } else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) && + radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) { + radv_initialize_htile(cmd_buffer, image, range, 0xffffffff); + } else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) && + !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) { VkImageSubresourceRange local_range = *range; local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; local_range.baseMipLevel = 0; @@ -3044,7 +3524,9 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, if (image->surface.htile_size) radv_handle_depth_image_transition(cmd_buffer, image, src_layout, - dst_layout, range, pending_clears); + dst_layout, src_queue_mask, + dst_queue_mask, range, + pending_clears); if (image->cmask.size) radv_handle_cmask_image_transition(cmd_buffer, image, src_layout, @@ -3118,32 +3600,21 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer, unsigned value) { struct radeon_winsys_cs *cs = cmd_buffer->cs; - uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo); + uint64_t va = radv_buffer_get_va(event->bo); cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8); - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12); + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18); /* TODO: this is overkill. Probably should figure something out from * the stage mask. */ - if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | - EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); - radeon_emit(cs, 2); - radeon_emit(cs, 0); - } - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | - EVENT_INDEX(5)); - radeon_emit(cs, va); - radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); - radeon_emit(cs, value); - radeon_emit(cs, 0); + si_cs_emit_write_event_eop(cs, + cmd_buffer->state.predicating, + cmd_buffer->device->physical_device->rad_info.chip_class, + false, + EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, + 1, va, 2, value); assert(cmd_buffer->cs->cdw <= cdw_max); } @@ -3185,20 +3656,13 @@ void radv_CmdWaitEvents(VkCommandBuffer commandBuffer, for (unsigned i = 0; i < eventCount; ++i) { RADV_FROM_HANDLE(radv_event, event, pEvents[i]); - uint64_t va = cmd_buffer->device->ws->buffer_get_va(event->bo); + uint64_t va = radv_buffer_get_va(event->bo); cmd_buffer->device->ws->cs_add_buffer(cs, event->bo, 8); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7); - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 1); /* reference value */ - radeon_emit(cs, 0xffffffff); /* mask */ - radeon_emit(cs, 4); /* poll interval */ - + si_emit_wait_fence(cs, false, va, 1, 0xffffffff); assert(cmd_buffer->cs->cdw <= cdw_max); }