#define NUM_USER_SGPRS 4
#endif
-static const char *scratch_rsrc_dword0_symbol =
- "SCRATCH_RSRC_DWORD0";
-
-static const char *scratch_rsrc_dword1_symbol =
- "SCRATCH_RSRC_DWORD1";
-
struct si_compute {
struct si_context *ctx;
#endif
};
-static void apply_scratch_relocs(const struct si_screen *sscreen,
- struct si_shader *shader, uint64_t scratch_va);
static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
{
unsigned scratch_bytes = 0;
program->shader.binary.global_symbol_offsets[i];
unsigned scratch_bytes_needed;
- si_shader_binary_read_config(&program->shader.binary,
+ si_shader_binary_read_config(sctx->screen,
&program->shader, offset);
scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
program->shader.scratch_bytes_per_wave = scratch_bytes;
/* Patch the shader with the scratch buffer address. */
- apply_scratch_relocs(sctx->screen, &program->shader, scratch_buffer_va);
-
+ si_shader_apply_scratch_relocs(sctx,
+ &program->shader, scratch_buffer_va);
}
static void *si_create_compute_state(
return scratch_waves;
}
-static void apply_scratch_relocs(const struct si_screen *sscreen,
- struct si_shader *shader, uint64_t scratch_va) {
- unsigned i;
- uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
- uint32_t scratch_rsrc_dword1 =
- S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
- | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
-
- if (!shader->binary.reloc_count) {
- return;
- }
-
- for (i = 0 ; i < shader->binary.reloc_count; i++) {
- const struct radeon_shader_reloc *reloc = &shader->binary.relocs[i];
- if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
- util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
- &scratch_rsrc_dword0, 4);
- } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
- util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
- &scratch_rsrc_dword1, 4);
- }
- }
-}
-
static void si_launch_grid(
struct pipe_context *ctx,
const uint *block_layout, const uint *grid_layout,
#if HAVE_LLVM >= 0x0306
/* Read the config information */
- si_shader_binary_read_config(&program->shader.binary, shader, pc);
+ si_shader_binary_read_config(sctx->screen, shader, pc);
#endif
/* Upload the kernel arguments */
LLVMContextDispose(program->llvm_ctx);
}
#else
+ FREE(program->shader.binary.config);
+ FREE(program->shader.binary.rodata);
+ FREE(program->shader.binary.global_symbol_offsets);
si_shader_destroy(ctx, &program->shader);
#endif
pipe_resource_reference(
(struct pipe_resource **)&program->input_buffer, NULL);
- radeon_shader_binary_free_members(&program->shader.binary, true);
FREE(program);
}
ctx->last_prim = -1;
ctx->last_multi_vgt_param = -1;
ctx->last_rast_prim = -1;
+ ctx->emit_scratch_reloc = true;
}
pipe_resource_reference(&sctx->gsvs_ring, NULL);
pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
r600_resource_reference(&sctx->border_color_table, NULL);
+ r600_resource_reference(&sctx->scratch_buffer, NULL);
si_pm4_free_state(sctx, sctx->init_config, ~0);
si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
sctx->null_const_buf.buffer->width0, 0, false);
}
+ /* XXX: This is the maximum value allowed. I'm not sure how to compute
+ * this for non-cs shaders. Using the wrong value here can result in
+ * GPU lockups, but the maximum value seems to always work.
+ */
+ sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units;
+
return &sctx->b.b;
fail:
si_destroy_context(&sctx->b.b);
r600_target = radeon_llvm_get_r600_target(triple);
sscreen->tm = LLVMCreateTargetMachine(r600_target, triple,
r600_get_llvm_processor_name(sscreen->b.family),
- "+DumpCode", LLVMCodeGenLevelDefault, LLVMRelocDefault,
+ "+DumpCode,+vgpr-spilling", LLVMCodeGenLevelDefault, LLVMRelocDefault,
LLVMCodeModelDefault);
#endif
return &sscreen->b.b;
struct si_buffer_resources const_buffers[SI_NUM_SHADERS];
struct si_buffer_resources rw_buffers[SI_NUM_SHADERS];
struct si_textures_info samplers[SI_NUM_SHADERS];
+ struct r600_resource *scratch_buffer;
struct r600_resource *border_color_table;
unsigned border_color_offset;
int last_prim;
int last_multi_vgt_param;
int last_rast_prim;
+
+ /* Scratch buffer */
+ boolean emit_scratch_reloc;
+ unsigned scratch_waves;
+ unsigned spi_tmpring_size;
};
/* si_blit.c */
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
+#include "radeon/r600_cs.h"
#include "radeon/radeon_llvm.h"
#include "radeon/radeon_elf_util.h"
#include "radeon/radeon_llvm_emit.h"
#include <errno.h>
+static const char *scratch_rsrc_dword0_symbol =
+ "SCRATCH_RSRC_DWORD0";
+
+static const char *scratch_rsrc_dword1_symbol =
+ "SCRATCH_RSRC_DWORD1";
+
struct si_shader_output_values
{
LLVMValueRef values[4];
}
}
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_binary_read_config(const struct si_screen *sscreen,
struct si_shader *shader,
unsigned symbol_offset)
{
unsigned i;
const unsigned char *config =
- radeon_shader_binary_config_start(binary, symbol_offset);
+ radeon_shader_binary_config_start(&shader->binary,
+ symbol_offset);
/* XXX: We may be able to emit some of these values directly rather than
* extracting fields to be emitted later.
*/
- for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
+ for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) {
unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
switch (reg) {
case R_0286CC_SPI_PS_INPUT_ENA:
shader->spi_ps_input_ena = value;
break;
+ case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
shader->scratch_bytes_per_wave =
}
}
+/**
+ * Patch the scratch buffer resource descriptor words into the shader code.
+ *
+ * Walks every relocation of \p shader->binary. For entries whose name is
+ * SCRATCH_RSRC_DWORD0 or SCRATCH_RSRC_DWORD1 the 4 bytes located at the
+ * relocation offset inside shader->binary.code are overwritten (stored
+ * little-endian). Relocations with any other name are left untouched.
+ *
+ * \param sctx        context; not referenced by this function
+ * \param shader      shader whose CPU-side binary is patched in place
+ * \param scratch_va  GPU address of the scratch buffer
+ */
+void si_shader_apply_scratch_relocs(struct si_context *sctx,
+			struct si_shader *shader,
+			uint64_t scratch_va)
+{
+	/* DWORD0 carries the low 32 bits of the scratch address. */
+	uint32_t rsrc_dword0 = scratch_va & 0xffffffff;
+	/* DWORD1 carries the high address bits together with the stride. */
+	uint32_t rsrc_dword1 =
+		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
+		| S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+	unsigned idx;
+
+	for (idx = 0; idx < shader->binary.reloc_count; idx++) {
+		const struct radeon_shader_reloc *entry =
+			shader->binary.relocs + idx;
+		uint32_t *patch;
+
+		/* Pick the descriptor word this relocation asks for. */
+		if (strcmp(scratch_rsrc_dword0_symbol, entry->name) == 0)
+			patch = &rsrc_dword0;
+		else if (strcmp(scratch_rsrc_dword1_symbol, entry->name) == 0)
+			patch = &rsrc_dword1;
+		else
+			continue;
+
+		util_memcpy_cpu_to_le32(shader->binary.code + entry->offset,
+					patch, 4);
+	}
+}
+
int si_shader_binary_read(struct si_screen *sscreen,
struct si_shader *shader,
const struct radeon_shader_binary *binary)
}
}
- si_shader_binary_read_config(binary, shader, 0);
+ si_shader_binary_read_config(sscreen, shader, 0);
/* copy new shader */
code_size = binary->code_size + binary->rodata_size;
LLVMModuleRef mod)
{
int r = 0;
- struct radeon_shader_binary binary;
bool dump = r600_can_dump_shader(&sscreen->b,
shader->selector ? shader->selector->tokens : NULL);
- memset(&binary, 0, sizeof(binary));
- r = radeon_llvm_compile(mod, &binary,
+ r = radeon_llvm_compile(mod, &shader->binary,
r600_get_llvm_processor_name(sscreen->b.family), dump, sscreen->tm);
if (r) {
return r;
}
- r = si_shader_binary_read(sscreen, shader, &binary);
- radeon_shader_binary_free_members(&binary, true);
+ r = si_shader_binary_read(sscreen, shader, &shader->binary);
+
+ FREE(shader->binary.config);
+ FREE(shader->binary.rodata);
+ FREE(shader->binary.global_symbol_offsets);
+ if (shader->scratch_bytes_per_wave == 0) {
+ FREE(shader->binary.code);
+ FREE(shader->binary.relocs);
+ memset(&shader->binary, 0, sizeof(shader->binary));
+ }
return r;
}
r600_resource_reference(&shader->scratch_bo, NULL);
r600_resource_reference(&shader->bo, NULL);
+
+ FREE(shader->binary.code);
+ FREE(shader->binary.relocs);
}
#include "si_state.h"
struct radeon_shader_binary;
+struct radeon_shader_reloc;
#define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */
#define SI_SGPR_CONST 2
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
const struct radeon_shader_binary *binary);
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_apply_scratch_relocs(struct si_context *sctx,
+ struct si_shader *shader,
+ uint64_t scratch_va);
+void si_shader_binary_read_config(const struct si_screen *sscreen,
struct si_shader *shader,
unsigned symbol_offset);
if (sctx->b.flags)
sctx->atoms.s.cache_flush->dirty = true;
+ if (sctx->emit_scratch_reloc) {
+ struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
+ sctx->spi_tmpring_size);
+
+ if (sctx->scratch_buffer) {
+ r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+ sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SHADER_RESOURCE_RW);
+
+ }
+ sctx->emit_scratch_reloc = false;
+ }
+
si_need_cs_space(sctx, 0, TRUE);
/* Emit states. */
S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
- S_00B32C_USER_SGPR(num_user_sgprs));
+ S_00B32C_USER_SGPR(num_user_sgprs) |
+ S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
}
static void si_shader_gs(struct si_shader *shader)
S_00B228_SGPRS((num_sgprs - 1) / 8) |
S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
- S_00B22C_USER_SGPR(num_user_sgprs));
+ S_00B22C_USER_SGPR(num_user_sgprs) |
+ S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
}
static void si_shader_vs(struct si_shader *shader)
S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
- S_00B12C_SO_EN(!!shader->selector->so.num_outputs));
+ S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
+ S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
if (window_space)
si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
- S_00B02C_USER_SGPR(num_user_sgprs));
+ S_00B02C_USER_SGPR(num_user_sgprs) |
+ S_00B02C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
}
static void si_shader_init_pm4_state(struct si_shader *shader)
false, false, 0, 0);
}
+/**
+ * Re-point the current variant of \p sel at the context's scratch buffer.
+ *
+ * When that variant uses scratch space and was last set up for a different
+ * scratch buffer, its scratch relocations are rewritten, the patched code is
+ * uploaded into a newly created shader bo and the shader's pm4 state is
+ * rebuilt.
+ *
+ * @returns 1 if \p sel has been updated to use a new scratch buffer and 0
+ * otherwise. 0 is also returned when the replacement shader bo cannot be
+ * created or mapped; the shader then keeps its previous bo and scratch_bo,
+ * so a later call tries again.
+ */
+static unsigned si_update_scratch_buffer(struct si_context *sctx,
+				struct si_shader_selector *sel)
+{
+	struct si_shader *shader;
+	struct r600_resource *new_bo;
+	uint64_t scratch_va;
+	unsigned char *ptr;
+
+	if (!sel)
+		return 0;
+
+	shader = sel->current;
+
+	/* This shader doesn't need a scratch buffer */
+	if (shader->scratch_bytes_per_wave == 0)
+		return 0;
+
+	/* This shader is already configured to use the current
+	 * scratch buffer. */
+	if (shader->scratch_bo == sctx->scratch_buffer)
+		return 0;
+
+	assert(sctx->scratch_buffer);
+
+	/* Read the address only after the early-outs and the assert above,
+	 * so the scratch buffer is never dereferenced on paths that do not
+	 * need it. */
+	scratch_va = sctx->scratch_buffer->gpu_address;
+
+	si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
+
+	/* Build the replacement bo before dropping the old one, so that a
+	 * failed allocation or mapping leaves the shader with a valid bo. */
+	new_bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE,
+					   shader->binary.code_size);
+	if (!new_bo)
+		return 0;
+
+	ptr = sctx->screen->b.ws->buffer_map(new_bo->cs_buf, NULL, PIPE_TRANSFER_WRITE);
+	if (!ptr) {
+		r600_resource_reference(&new_bo, NULL);
+		return 0;
+	}
+	util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size);
+	sctx->screen->b.ws->buffer_unmap(new_bo->cs_buf);
+
+	/* Replace the shader bo with the new bo that has the relocs applied. */
+	r600_resource_reference(&shader->bo, NULL);
+	shader->bo = new_bo;
+
+	/* Update the shader state to use the new shader bo. */
+	si_shader_init_pm4_state(shader);
+
+	/* Remember which scratch buffer this shader was patched for. */
+	r600_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
+
+	return 1;
+}
+
+/**
+ * Size of the context's scratch buffer.
+ *
+ * @returns the width0 of sctx->scratch_buffer, or 0 when the context has no
+ * scratch buffer.
+ */
+static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
+{
+	return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0;
+}
+
+/**
+ * Scratch bytes per wave of the current variant of \p sel.
+ *
+ * \param sctx  context; not referenced by this function
+ * \param sel   shader selector, may be NULL
+ *
+ * @returns sel->current->scratch_bytes_per_wave, or 0 when \p sel is NULL.
+ */
+static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx,
+				struct si_shader_selector *sel)
+{
+	return sel ? sel->current->scratch_bytes_per_wave : 0;
+}
+
+/**
+ * Largest scratch-bytes-per-wave value among the context's PS, GS and VS
+ * shader selectors (a selector that is NULL contributes 0).
+ */
+static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
+{
+	const unsigned ps_bytes =
+		si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader);
+	const unsigned gs_bytes =
+		si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader);
+	const unsigned vs_bytes =
+		si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader);
+
+	return MAX3(ps_bytes, gs_bytes, vs_bytes);
+}
+
+/**
+ * Recompute sctx->spi_tmpring_size for the PS/GS/VS shaders held by \p sctx.
+ *
+ * The required scratch size is the largest per-wave scratch requirement of
+ * those shaders multiplied by sctx->scratch_waves. If that exceeds the size
+ * of the current scratch buffer, the buffer is replaced by a new one of the
+ * required size. Every shader that needs scratch is then re-pointed at the
+ * current scratch buffer and its pm4 state is re-bound when it changed.
+ *
+ * If the larger scratch buffer cannot be allocated, the function returns
+ * early: sctx->scratch_buffer is left NULL, no shader is touched and
+ * sctx->spi_tmpring_size keeps its previous value.
+ */
+static void si_update_spi_tmpring_size(struct si_context *sctx)
+{
+	unsigned current_scratch_buffer_size =
+		si_get_current_scratch_buffer_size(sctx);
+	unsigned scratch_bytes_per_wave =
+		si_get_max_scratch_bytes_per_wave(sctx);
+	unsigned scratch_needed_size = scratch_bytes_per_wave *
+		sctx->scratch_waves;
+
+	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
+	assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
+		"scratch size should already be aligned correctly.");
+
+	if (scratch_needed_size > 0) {
+
+		if (scratch_needed_size > current_scratch_buffer_size) {
+			/* Create a bigger scratch buffer */
+			r600_resource_reference(&sctx->scratch_buffer, NULL);
+
+			sctx->scratch_buffer =
+				si_resource_create_custom(&sctx->screen->b.b,
+					PIPE_USAGE_DEFAULT, scratch_needed_size);
+
+			/* Without a buffer there is nothing the shaders
+			 * could be patched against; bail out instead of
+			 * dereferencing NULL below. */
+			if (!sctx->scratch_buffer)
+				return;
+		}
+
+		/* Update the shaders, so they are using the latest scratch. The
+		 * scratch buffer may have been changed since these shaders were
+		 * last used, so we still need to try to update them, even if
+		 * they require scratch buffers smaller than the current size.
+		 */
+		if (si_update_scratch_buffer(sctx, sctx->ps_shader))
+			si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
+		if (si_update_scratch_buffer(sctx, sctx->gs_shader))
+			si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
+		if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+			si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+	}
+
+	/* WAVESIZE is programmed from the per-wave byte count shifted right
+	 * by 10, which is why the size is asserted to be 1024-aligned above. */
+	sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
+				S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
+}
+
void si_update_shaders(struct si_context *sctx)
{
struct pipe_context *ctx = (struct pipe_context*)sctx;
si_update_spi_map(sctx);
}
+ if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
+ si_pm4_state_changed(sctx, gs)) {
+ si_update_spi_tmpring_size(sctx);
+ }
+
if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) {
sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control;
sctx->db_render_state.dirty = true;