unsigned scratch_bytes_needed;
si_shader_binary_read_config(&program->shader, offset);
- scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
+ scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
}
* to the maximum bytes needed, so it can compute the stride
* correctly.
*/
- program->shader.scratch_bytes_per_wave = scratch_bytes;
+ program->shader.config.scratch_bytes_per_wave = scratch_bytes;
/* Patch the shader with the scratch buffer address. */
si_shader_apply_scratch_relocs(sctx,
memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);
- if (shader->scratch_bytes_per_wave > 0) {
+ if (shader->config.scratch_bytes_per_wave > 0) {
COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
"Total Scratch: %u bytes\n", num_waves_for_scratch,
- shader->scratch_bytes_per_wave,
- shader->scratch_bytes_per_wave *
+ shader->config.scratch_bytes_per_wave,
+ shader->config.scratch_bytes_per_wave *
num_waves_for_scratch);
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va);
si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
- | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64));
+ | S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64));
si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
- si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);
+ si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1);
- lds_blocks = shader->lds_size;
+ lds_blocks = shader->config.lds_size;
/* XXX: We are over allocating LDS. For SI, the shader reports LDS in
* blocks of 256 bytes, so if there are 4 bytes lds allocated in
* the shader and 4 bytes allocated by the state tracker, then
assert(lds_blocks <= 0xFF);
- shader->rsrc2 &= C_00B84C_LDS_SIZE;
- shader->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
+ shader->config.rsrc2 &= C_00B84C_LDS_SIZE;
+ shader->config.rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
- si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
+ si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2);
si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);
si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
* COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
*/
S_00B860_WAVES(num_waves_for_scratch)
- | S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10))
+ | S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10))
;
si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
case R_00B848_COMPUTE_PGM_RSRC1:
- shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
- shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
- shader->float_mode = G_00B028_FLOAT_MODE(value);
- shader->rsrc1 = value;
+ shader->config.num_sgprs = MAX2(shader->config.num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+ shader->config.num_vgprs = MAX2(shader->config.num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+ shader->config.float_mode = G_00B028_FLOAT_MODE(value);
+ shader->config.rsrc1 = value;
break;
case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
- shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+ shader->config.lds_size = MAX2(shader->config.lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
break;
case R_00B84C_COMPUTE_PGM_RSRC2:
- shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
- shader->rsrc2 = value;
+ shader->config.lds_size = MAX2(shader->config.lds_size, G_00B84C_LDS_SIZE(value));
+ shader->config.rsrc2 = value;
break;
case R_0286CC_SPI_PS_INPUT_ENA:
- shader->spi_ps_input_ena = value;
+ shader->config.spi_ps_input_ena = value;
break;
case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
- shader->scratch_bytes_per_wave =
+ shader->config.scratch_bytes_per_wave =
G_00B860_WAVESIZE(value) * 256 * 4 * 1;
break;
default:
uint32_t scratch_rsrc_dword0 = scratch_va;
uint32_t scratch_rsrc_dword1 =
S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
- | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+ | S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64);
for (i = 0 ; i < shader->binary.reloc_count; i++) {
const struct radeon_shader_reloc *reloc =
fprintf(stderr, "*** SHADER STATS ***\n"
"SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
"Scratch: %d bytes per wave\n********************\n",
- shader->num_sgprs, shader->num_vgprs, binary->code_size,
- shader->lds_size, shader->scratch_bytes_per_wave);
+ shader->config.num_sgprs, shader->config.num_vgprs, binary->code_size,
+ shader->config.lds_size, shader->config.scratch_bytes_per_wave);
}
pipe_debug_message(debug, SHADER_INFO,
"Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
- shader->num_sgprs, shader->num_vgprs, binary->code_size,
- shader->lds_size, shader->scratch_bytes_per_wave);
+ shader->config.num_sgprs, shader->config.num_vgprs,
+ binary->code_size, shader->config.lds_size,
+ shader->config.scratch_bytes_per_wave);
}
int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
FREE(shader->binary.config);
FREE(shader->binary.rodata);
FREE(shader->binary.global_symbol_offsets);
- if (shader->scratch_bytes_per_wave == 0) {
+ if (shader->config.scratch_bytes_per_wave == 0) {
FREE(shader->binary.code);
FREE(shader->binary.relocs);
memset(&shader->binary, 0,
} tes; /* tessellation evaluation shader */
};
+struct si_shader_config {
+ unsigned num_sgprs;
+ unsigned num_vgprs;
+ unsigned lds_size;
+ unsigned spi_ps_input_ena;
+ unsigned float_mode;
+ unsigned scratch_bytes_per_wave;
+ unsigned rsrc1;
+ unsigned rsrc2;
+};
+
struct si_shader {
struct si_shader_selector *selector;
struct si_shader *next_variant;
struct si_pm4_state *pm4;
struct r600_resource *bo;
struct r600_resource *scratch_bo;
- struct radeon_shader_binary binary;
- unsigned num_sgprs;
- unsigned num_vgprs;
- unsigned lds_size;
- unsigned spi_ps_input_ena;
- unsigned float_mode;
- unsigned scratch_bytes_per_wave;
union si_shader_key key;
+ struct radeon_shader_binary binary;
+ struct si_shader_config config;
unsigned nparam;
unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
unsigned nr_param_exports;
bool is_gs_copy_shader;
bool dx10_clamp_mode; /* convert NaNs to 0 */
-
- unsigned rsrc1;
- unsigned rsrc2;
};
static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
lds_size = output_patch0_offset + output_patch_size * *num_patches;
- ls_rsrc2 = ls->current->rsrc2;
+ ls_rsrc2 = ls->current->config.rsrc2;
if (sctx->b.chip_class >= CIK) {
assert(lds_size <= 65536);
if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
- radeon_emit(cs, ls->current->rsrc1);
+ radeon_emit(cs, ls->current->config.rsrc1);
radeon_emit(cs, ls_rsrc2);
/* Compute userdata SGPRs. */
vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
num_user_sgprs = SI_LS_NUM_USER_SGPR;
- num_sgprs = shader->num_sgprs;
+ num_sgprs = shader->config.num_sgprs;
if (num_user_sgprs > num_sgprs) {
/* Last 2 reserved SGPRs are used for VCC */
num_sgprs = num_user_sgprs + 2;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
- shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+ shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B528_SGPRS((num_sgprs - 1) / 8) |
S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
- shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
- S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
+ shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+ S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
}
static void si_shader_hs(struct si_shader *shader)
si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
num_user_sgprs = SI_TCS_NUM_USER_SGPR;
- num_sgprs = shader->num_sgprs;
+ num_sgprs = shader->config.num_sgprs;
/* One SGPR after user SGPRs is pre-loaded with tessellation factor
* buffer offset. */
if ((num_user_sgprs + 1) > num_sgprs) {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
- S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+ S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B428_SGPRS((num_sgprs - 1) / 8) |
S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
S_00B42C_USER_SGPR(num_user_sgprs) |
- S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+ S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
}
static void si_shader_es(struct si_shader *shader)
} else
unreachable("invalid shader selector type");
- num_sgprs = shader->num_sgprs;
+ num_sgprs = shader->config.num_sgprs;
/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
if ((num_user_sgprs + 1) > num_sgprs) {
/* Last 2 reserved SGPRs are used for VCC */
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
- S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
+ S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B328_SGPRS((num_sgprs - 1) / 8) |
S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
S_00B32C_USER_SGPR(num_user_sgprs) |
- S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+ S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
si_set_tesseval_regs(shader, pm4);
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
num_user_sgprs = SI_GS_NUM_USER_SGPR;
- num_sgprs = shader->num_sgprs;
+ num_sgprs = shader->config.num_sgprs;
/* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
if ((num_user_sgprs + 2) > num_sgprs) {
/* Last 2 reserved SGPRs are used for VCC */
assert(num_sgprs <= 104);
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B228_SGPRS((num_sgprs - 1) / 8) |
S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
S_00B22C_USER_SGPR(num_user_sgprs) |
- S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
}
static void si_shader_vs(struct si_shader *shader)
} else
unreachable("invalid shader selector type");
- num_sgprs = shader->num_sgprs;
+ num_sgprs = shader->config.num_sgprs;
if (num_user_sgprs > num_sgprs) {
/* Last 2 reserved SGPRs are used for VCC */
num_sgprs = num_user_sgprs + 2;
si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
- S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
+ S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B128_SGPRS((num_sgprs - 1) / 8) |
S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
- S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+ S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
if (window_space)
si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
}
/* Set interpolation controls. */
- has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) ||
- G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena);
+ has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);
num_user_sgprs = SI_PS_NUM_USER_SGPR;
- num_sgprs = shader->num_sgprs;
+ num_sgprs = shader->config.num_sgprs;
/* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
if ((num_user_sgprs + 1) > num_sgprs) {
/* Last 2 reserved SGPRs are used for VCC */
assert(num_sgprs <= 104);
si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
- S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
+ S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B028_SGPRS((num_sgprs - 1) / 8) |
S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
- S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
+ S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
S_00B02C_USER_SGPR(num_user_sgprs) |
- S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+ S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
}
static void si_shader_init_pm4_state(struct si_shader *shader)
if (!ps)
return;
- input_ena = ps->spi_ps_input_ena;
+ input_ena = ps->config.spi_ps_input_ena;
/* we need to enable at least one of them, otherwise we hang the GPU */
assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
return 0;
/* This shader doesn't need a scratch buffer */
- if (shader->scratch_bytes_per_wave == 0)
+ if (shader->config.scratch_bytes_per_wave == 0)
return 0;
/* This shader is already configured to use the current
static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
{
- return shader ? shader->scratch_bytes_per_wave : 0;
+ return shader ? shader->config.scratch_bytes_per_wave : 0;
}
static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)