From 2397a721291457c146c7f4bcd48adcb3b2d979bd Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 10 Dec 2014 09:13:59 -0500 Subject: [PATCH] radeonsi: Enable VGPR spilling for all shader types v5 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit v2: - Only emit write SPI_TMPRING_SIZE once per packet. - Use context global scratch buffer. v3: - Patch shaders using WRITE_DATA packet instead of map/unmap. - Emit ICACHE_FLUSH, CS_PARTIAL_FLUSH, PS_PARTIAL_FLUSH, and VS_PARTIAL_FLUSH when patching shaders. v4: - Code cleanups. - Remove unnecessary multiplies. v5: - Patch shaders in system memory and re-upload to vram. Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_compute.c | 44 +----- src/gallium/drivers/radeonsi/si_hw_context.c | 1 + src/gallium/drivers/radeonsi/si_pipe.c | 9 +- src/gallium/drivers/radeonsi/si_pipe.h | 6 + src/gallium/drivers/radeonsi/si_shader.c | 59 ++++++-- src/gallium/drivers/radeonsi/si_shader.h | 6 +- src/gallium/drivers/radeonsi/si_state_draw.c | 14 ++ .../drivers/radeonsi/si_state_shaders.c | 130 +++++++++++++++++- 8 files changed, 217 insertions(+), 52 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 12b91db07c1..5009f699443 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -42,12 +42,6 @@ #define NUM_USER_SGPRS 4 #endif -static const char *scratch_rsrc_dword0_symbol = - "SCRATCH_RSRC_DWORD0"; - -static const char *scratch_rsrc_dword1_symbol = - "SCRATCH_RSRC_DWORD1"; - struct si_compute { struct si_context *ctx; @@ -67,8 +61,6 @@ struct si_compute { #endif }; -static void apply_scratch_relocs(const struct si_screen *sscreen, - struct si_shader *shader, uint64_t scratch_va); static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program) { unsigned scratch_bytes = 0; @@ -83,7 +75,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog program->shader.binary.global_symbol_offsets[i]; unsigned scratch_bytes_needed; - si_shader_binary_read_config(&program->shader.binary, + si_shader_binary_read_config(sctx->screen, &program->shader, offset); scratch_bytes_needed = program->shader.scratch_bytes_per_wave; scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed); @@ -106,8 +98,8 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog program->shader.scratch_bytes_per_wave = scratch_bytes; /* Patch the shader with the scratch buffer address. */ - apply_scratch_relocs(sctx->screen, &program->shader, scratch_buffer_va); - + si_shader_apply_scratch_relocs(sctx, + &program->shader, scratch_buffer_va); } static void *si_create_compute_state( @@ -231,30 +223,6 @@ static unsigned compute_num_waves_for_scratch( return scratch_waves; } -static void apply_scratch_relocs(const struct si_screen *sscreen, - struct si_shader *shader, uint64_t scratch_va) { - unsigned i; - uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; - uint32_t scratch_rsrc_dword1 = - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); - - if (!shader->binary.reloc_count) { - return; - } - - for (i = 0 ; i < shader->binary.reloc_count; i++) { - const struct radeon_shader_reloc *reloc = &shader->binary.relocs[i]; - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, - &scratch_rsrc_dword0, 4); - } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, - &scratch_rsrc_dword1, 4); - } - } -} - static void si_launch_grid( struct pipe_context *ctx, const uint *block_layout, const uint *grid_layout, @@ -299,7 +267,7 @@ static void si_launch_grid( #if HAVE_LLVM >= 0x0306 /* Read the config information */ - si_shader_binary_read_config(&program->shader.binary, shader, pc); + si_shader_binary_read_config(sctx->screen, shader, pc); #endif /* Upload the kernel arguments */ @@ -510,13 +478,15 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){ LLVMContextDispose(program->llvm_ctx); } #else + FREE(program->shader.binary.config); + FREE(program->shader.binary.rodata); + FREE(program->shader.binary.global_symbol_offsets); si_shader_destroy(ctx, &program->shader); #endif pipe_resource_reference( (struct pipe_resource **)&program->input_buffer, NULL); - radeon_shader_binary_free_members(&program->shader.binary, true); FREE(program); } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 21c3ebfc4a5..1cacc2660c3 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -160,4 +160,5 @@ void si_begin_new_cs(struct si_context *ctx) ctx->last_prim = -1; ctx->last_multi_vgt_param = -1; ctx->last_rast_prim = -1; + ctx->emit_scratch_reloc = true; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index e3f8fcf8032..eb2b785de53 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -46,6 +46,7 @@ static void si_destroy_context(struct pipe_context *context) pipe_resource_reference(&sctx->gsvs_ring, NULL); pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); r600_resource_reference(&sctx->border_color_table, NULL); + r600_resource_reference(&sctx->scratch_buffer, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings); @@ -158,6 +159,12 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->null_const_buf.buffer->width0, 0, false); } + /* XXX: This is the maximum value allowed. I'm not sure how to compute + * this for non-cs shaders. Using the wrong value here can result in + * GPU lockups, but the maximum value seems to always work. + */ + sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units; + return &sctx->b.b; fail: si_destroy_context(&sctx->b.b); @@ -525,7 +532,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) r600_target = radeon_llvm_get_r600_target(triple); sscreen->tm = LLVMCreateTargetMachine(r600_target, triple, r600_get_llvm_processor_name(sscreen->b.family), - "+DumpCode", LLVMCodeGenLevelDefault, LLVMRelocDefault, + "+DumpCode,+vgpr-spilling", LLVMCodeGenLevelDefault, LLVMRelocDefault, LLVMCodeModelDefault); #endif return &sscreen->b.b; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 6144fb1b2f0..b88f1542c5e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -173,6 +173,7 @@ struct si_context { struct si_buffer_resources const_buffers[SI_NUM_SHADERS]; struct si_buffer_resources rw_buffers[SI_NUM_SHADERS]; struct si_textures_info samplers[SI_NUM_SHADERS]; + struct r600_resource *scratch_buffer; struct r600_resource *border_color_table; unsigned border_color_offset; @@ -220,6 +221,11 @@ struct si_context { int last_prim; int last_multi_vgt_param; int last_rast_prim; + + /* Scratch buffer */ + boolean emit_scratch_reloc; + unsigned scratch_waves; + unsigned spi_tmpring_size; }; /* si_blit.c */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 571ce67c7c3..fb1419ddb4d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -32,6 +32,7 @@ #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_arit.h" #include "gallivm/lp_bld_flow.h" +#include "radeon/r600_cs.h" #include "radeon/radeon_llvm.h" #include "radeon/radeon_elf_util.h" #include "radeon/radeon_llvm_emit.h" @@ -46,6 +47,12 @@ #include +static const char *scratch_rsrc_dword0_symbol = + "SCRATCH_RSRC_DWORD0"; + +static const char *scratch_rsrc_dword1_symbol = + "SCRATCH_RSRC_DWORD1"; + struct si_shader_output_values { LLVMValueRef values[4]; @@ -2517,19 +2524,20 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) } } -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, +void si_shader_binary_read_config(const struct si_screen *sscreen, struct si_shader *shader, unsigned symbol_offset) { unsigned i; const unsigned char *config = - radeon_shader_binary_config_start(binary, symbol_offset); + radeon_shader_binary_config_start(&shader->binary, + symbol_offset); /* XXX: We may be able to emit some of these values directly rather than * extracting fields to be emitted later. */ - for (i = 0; i < binary->config_size_per_symbol; i+= 8) { + for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) { unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); switch (reg) { @@ -2549,6 +2557,7 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary, case R_0286CC_SPI_PS_INPUT_ENA: shader->spi_ps_input_ena = value; break; + case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. */ shader->scratch_bytes_per_wave = @@ -2562,6 +2571,29 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary, } } +void si_shader_apply_scratch_relocs(struct si_context *sctx, + struct si_shader *shader, + uint64_t scratch_va) +{ + unsigned i; + uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; + uint32_t scratch_rsrc_dword1 = + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) + | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); + + for (i = 0 ; i < shader->binary.reloc_count; i++) { + const struct radeon_shader_reloc *reloc = + &shader->binary.relocs[i]; + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { + util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, + &scratch_rsrc_dword0, 4); + } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { + util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, + &scratch_rsrc_dword1, 4); + } + } +} + int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, const struct radeon_shader_binary *binary) @@ -2582,7 +2614,7 @@ int si_shader_binary_read(struct si_screen *sscreen, } } - si_shader_binary_read_config(binary, shader, 0); + si_shader_binary_read_config(sscreen, shader, 0); /* copy new shader */ code_size = binary->code_size + binary->rodata_size; @@ -2610,18 +2642,24 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMModuleRef mod) { int r = 0; - struct radeon_shader_binary binary; bool dump = r600_can_dump_shader(&sscreen->b, shader->selector ? shader->selector->tokens : NULL); - memset(&binary, 0, sizeof(binary)); - r = radeon_llvm_compile(mod, &binary, + r = radeon_llvm_compile(mod, &shader->binary, r600_get_llvm_processor_name(sscreen->b.family), dump, sscreen->tm); if (r) { return r; } - r = si_shader_binary_read(sscreen, shader, &binary); - radeon_shader_binary_free_members(&binary, true); + r = si_shader_binary_read(sscreen, shader, &shader->binary); + + FREE(shader->binary.config); + FREE(shader->binary.rodata); + FREE(shader->binary.global_symbol_offsets); + if (shader->scratch_bytes_per_wave == 0) { + FREE(shader->binary.code); + FREE(shader->binary.relocs); + memset(&shader->binary, 0, sizeof(shader->binary)); + } return r; } @@ -2861,4 +2899,7 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader) r600_resource_reference(&shader->scratch_bo, NULL); r600_resource_reference(&shader->bo, NULL); + + FREE(shader->binary.code); + FREE(shader->binary.relocs); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 6def5c71313..1d7efc23f59 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -34,6 +34,7 @@ #include "si_state.h" struct radeon_shader_binary; +struct radeon_shader_reloc; #define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */ #define SI_SGPR_CONST 2 @@ -186,7 +187,10 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, const struct radeon_shader_binary *binary); -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, +void si_shader_apply_scratch_relocs(struct si_context *sctx, + struct si_shader *shader, + uint64_t scratch_va); +void si_shader_binary_read_config(const struct si_screen *sscreen, struct si_shader *shader, unsigned symbol_offset); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cd4880bfd2d..9446eca09c8 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -571,6 +571,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (sctx->b.flags) sctx->atoms.s.cache_flush->dirty = true; + if (sctx->emit_scratch_reloc) { + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, + sctx->spi_tmpring_size); + + if (sctx->scratch_buffer) { + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + sctx->scratch_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RESOURCE_RW); + + } + sctx->emit_scratch_reloc = false; + } + si_need_cs_space(sctx, 0, TRUE); /* Emit states. */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 3249bcc5aa6..a40926800dd 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -67,7 +67,8 @@ static void si_shader_es(struct si_shader *shader) S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, - S_00B32C_USER_SGPR(num_user_sgprs)); + S_00B32C_USER_SGPR(num_user_sgprs) | + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); } static void si_shader_gs(struct si_shader *shader) @@ -136,7 +137,8 @@ static void si_shader_gs(struct si_shader *shader) S_00B228_SGPRS((num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_USER_SGPR(num_user_sgprs)); + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); } static void si_shader_vs(struct si_shader *shader) @@ -216,7 +218,8 @@ static void si_shader_vs(struct si_shader *shader) S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | - S_00B12C_SO_EN(!!shader->selector->so.num_outputs)); + S_00B12C_SO_EN(!!shader->selector->so.num_outputs) | + S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); if (window_space) si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL, S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1)); @@ -311,7 +314,8 @@ static void si_shader_ps(struct si_shader *shader) S_00B028_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) | - S_00B02C_USER_SGPR(num_user_sgprs)); + S_00B02C_USER_SGPR(num_user_sgprs) | + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); } static void si_shader_init_pm4_state(struct si_shader *shader) @@ -710,6 +714,119 @@ static void si_init_gs_rings(struct si_context *sctx) false, false, 0, 0); } +/** + * @returns 1 if \p sel has been updated to use a new scratch buffer and 0 + * otherwise. + */ +static unsigned si_update_scratch_buffer(struct si_context *sctx, + struct si_shader_selector *sel) +{ + struct si_shader *shader; + uint64_t scratch_va = sctx->scratch_buffer->gpu_address; + unsigned char *ptr; + + if (!sel) + return 0; + + shader = sel->current; + + /* This shader doesn't need a scratch buffer */ + if (shader->scratch_bytes_per_wave == 0) + return 0; + + /* This shader is already configured to use the current + * scratch buffer. */ + if (shader->scratch_bo == sctx->scratch_buffer) + return 0; + + assert(sctx->scratch_buffer); + + si_shader_apply_scratch_relocs(sctx, shader, scratch_va); + + /* Replace the shader bo with a new bo that has the relocs applied. */ + r600_resource_reference(&shader->bo, NULL); + shader->bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE, + shader->binary.code_size); + ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE); + util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size); + sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf); + + /* Update the shader state to use the new shader bo. */ + si_shader_init_pm4_state(shader); + + r600_resource_reference(&shader->scratch_bo, sctx->scratch_buffer); + + return 1; +} + +static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) +{ + if (!sctx->scratch_buffer) + return 0; + + return sctx->scratch_buffer->b.b.width0; +} + +static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx, + struct si_shader_selector *sel) +{ + if (!sel) + return 0; + + return sel->current->scratch_bytes_per_wave; +} + +static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) +{ + + return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader), + si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader), + si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); +} + +static void si_update_spi_tmpring_size(struct si_context *sctx) +{ + unsigned current_scratch_buffer_size = + si_get_current_scratch_buffer_size(sctx); + unsigned scratch_bytes_per_wave = + si_get_max_scratch_bytes_per_wave(sctx); + unsigned scratch_needed_size = scratch_bytes_per_wave * + sctx->scratch_waves; + + if (scratch_needed_size > 0) { + + if (scratch_needed_size > current_scratch_buffer_size) { + /* Create a bigger scratch buffer */ + pipe_resource_reference( + (struct pipe_resource**)&sctx->scratch_buffer, + NULL); + + sctx->scratch_buffer = + si_resource_create_custom(&sctx->screen->b.b, + PIPE_USAGE_DEFAULT, scratch_needed_size); + } + + /* Update the shaders, so they are using the latest scratch. The + * scratch buffer may have been changed since these shaders were + * last used, so we still need to try to update them, even if + * they require scratch buffers smaller than the current size. + */ + if (si_update_scratch_buffer(sctx, sctx->ps_shader)) + si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4); + if (si_update_scratch_buffer(sctx, sctx->gs_shader)) + si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4); + if (si_update_scratch_buffer(sctx, sctx->vs_shader)) + si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); + } + + /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && + "scratch size should already be aligned correctly."); + + sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | + S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10); +} + void si_update_shaders(struct si_context *sctx) { struct pipe_context *ctx = (struct pipe_context*)sctx; @@ -786,6 +903,11 @@ void si_update_shaders(struct si_context *sctx) si_update_spi_map(sctx); } + if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) || + si_pm4_state_changed(sctx, gs)) { + si_update_spi_tmpring_size(sctx); + } + if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) { sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control; sctx->db_render_state.dirty = true; -- 2.30.2