LLVMBuilderRef builder = ctx->ac.builder;
LLVMValueRef tmp;
tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
- LLVMConstInt(ctx->ac.i32, 64, false), "");
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}
LLVMValueRef numprims =
LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
- numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64);
+ numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, "");
ac_build_ifcc(&ctx->ac, tmp, 5105);
/* Round up towards full wave sizes for better ALU utilization. */
if (!max_vert_out_per_gs_instance) {
- const unsigned wavesize = 64;
+ const unsigned wavesize = gs_sel->screen->ge_wave_size;
unsigned orig_max_esverts;
unsigned orig_max_gsprims;
do {
if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
.info = &sel->screen->info,
.shader_type = MESA_SHADER_COMPUTE,
- .wave_size = 64,
+ .wave_size = sel->screen->compute_wave_size,
.num_parts = 1,
.elf_ptrs = &program->shader.binary.elf_buffer,
.elf_sizes = &program->shader.binary.elf_size }))
si_nir_opts(sel->nir);
si_nir_scan_shader(sel->nir, &sel->info);
- si_lower_nir(sel);
+ si_lower_nir(sel, sscreen->compute_wave_size);
}
/* Store the declared LDS size into tgsi_shader_info for the shader
program->num_cs_user_data_dwords;
shader->config.rsrc1 =
- S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B848_VGPRS((shader->config.num_vgprs - 1) /
+ (sscreen->compute_wave_size == 32 ? 8 : 4)) |
S_00B848_DX10_CLAMP(1) |
S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) |
unsigned threads_per_threadgroup =
info->block[0] * info->block[1] * info->block[2];
unsigned waves_per_threadgroup =
- DIV_ROUND_UP(threads_per_threadgroup, 64);
+ DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
unsigned threadgroups_per_cu = 1;
if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
S_00B800_FORCE_START_AT_000(1) |
/* If the KMD allows it (there is a KMD hw register for it),
* allow launching waves out-of-order. (same as Vulkan) */
- S_00B800_ORDER_MODE(sctx->chip_class >= GFX7);
+ S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
+ S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);
const uint *last_block = info->last_block;
bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
SI_COMPUTE_CLEAR_DW_PER_THREAD;
unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
- unsigned dwords_per_wave = dwords_per_thread * 64;
+ unsigned wave_size = sctx->screen->compute_wave_size;
+ unsigned dwords_per_wave = dwords_per_thread * wave_size;
unsigned num_dwords = size / 4;
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
struct pipe_grid_info info = {};
- info.block[0] = MIN2(64, num_instructions);
+ info.block[0] = MIN2(wave_size, num_instructions);
info.block[1] = 1;
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
uint64_t *addr,
unsigned *num,
struct si_shader_inst *instructions,
- enum pipe_shader_type shader_type)
+ enum pipe_shader_type shader_type,
+ unsigned wave_size)
{
if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){
.info = &screen->info,
.shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = 64,
+ .wave_size = wave_size,
.num_parts = 1,
.elf_ptrs = &binary->elf_buffer,
.elf_sizes = &binary->elf_size }))
*/
unsigned num_inst = 0;
uint64_t inst_addr = start_addr;
+ unsigned wave_size = si_get_shader_wave_size(shader);
struct ac_rtld_binary rtld_binaries[5] = {};
struct si_shader_inst *instructions =
calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
if (shader->prolog) {
si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary,
- &inst_addr, &num_inst, instructions, shader_type);
+ &inst_addr, &num_inst, instructions, shader_type, wave_size);
}
if (shader->previous_stage) {
si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary,
- &inst_addr, &num_inst, instructions, shader_type);
+ &inst_addr, &num_inst, instructions, shader_type, wave_size);
}
if (shader->prolog2) {
si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary,
- &inst_addr, &num_inst, instructions, shader_type);
+ &inst_addr, &num_inst, instructions, shader_type, wave_size);
}
si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary,
- &inst_addr, &num_inst, instructions, shader_type);
+ &inst_addr, &num_inst, instructions, shader_type, wave_size);
if (shader->epilog) {
si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary,
- &inst_addr, &num_inst, instructions, shader_type);
+ &inst_addr, &num_inst, instructions, shader_type, wave_size);
}
fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
if (ret) {
uint32_t *subgroup_size = ret;
- *subgroup_size = 64;
+ *subgroup_size = sscreen->compute_wave_size;
}
return sizeof(uint32_t);
case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
compiler->passes = ac_create_llvm_passes(compiler->tm);
+ if (compiler->tm_wave32)
+ compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
if (compiler->low_opt_tm)
compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
}
for (i = 0; i < num_comp_lo_threads; i++)
si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);
+ sscreen->ge_wave_size = 64;
+ sscreen->ps_wave_size = 64;
+ sscreen->compute_wave_size = 64;
+
/* Create the auxiliary context. This must be done last. */
sscreen->aux_context = si_create_context(
&sscreen->b, sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0);
/* Use at most 2 low priority threads on quadcore and better.
* We want to minimize the impact on multithreaded Mesa. */
struct ac_llvm_compiler compiler_lowp[10];
+
+ unsigned compute_wave_size;
+ unsigned ps_wave_size;
+ unsigned ge_wave_size;
};
struct si_blend_color {
return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}
+static inline unsigned si_get_wave_size(struct si_screen *sscreen,
+ enum pipe_shader_type shader_type,
+ bool ngg)
+{
+ if (shader_type == PIPE_SHADER_COMPUTE)
+ return sscreen->compute_wave_size;
+ else if (shader_type == PIPE_SHADER_FRAGMENT)
+ return sscreen->ps_wave_size;
+ else if (shader_type == PIPE_SHADER_GEOMETRY && !ngg) /* legacy GS only supports Wave64 */
+ return 64;
+ else
+ return sscreen->ge_wave_size;
+}
+
+static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
+{
+ return si_get_wave_size(shader->selector->screen, shader->selector->type,
+ shader->key.as_ngg);
+}
+
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
static void si_init_shader_ctx(struct si_shader_context *ctx,
struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler);
+ struct ac_llvm_compiler *compiler,
+ unsigned wave_size);
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
struct lp_build_tgsi_context *bld_base,
break;
case TGSI_SEMANTIC_SUBGROUP_SIZE:
- value = LLVMConstInt(ctx->i32, 64, 0);
+ value = LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0);
break;
case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
LLVMBuildMul(ctx->ac.builder, wave_idx,
- LLVMConstInt(ctx->i32, 64, false), ""), "");
+ LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), "");
lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
}
/* Limit on the stride field for <= GFX7. */
assert(stride < (1 << 14));
- num_records = 64;
+ num_records = ctx->ac.wave_size;
ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
tmp = LLVMBuildAdd(builder, tmp,
LLVMConstInt(ctx->i64,
stream_offset, 0), "");
- stream_offset += stride * 64;
+ stream_offset += stride * ctx->ac.wave_size;
ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
.halt_at_entry = screen->options.halt_shaders,
},
.shader_type = tgsi_processor_to_shader_stage(sel->type),
- .wave_size = 64,
+ .wave_size = si_get_shader_wave_size(shader),
.num_parts = num_parts,
.elf_ptrs = part_elfs,
.elf_sizes = part_sizes,
static void si_shader_dump_disassembly(struct si_screen *screen,
const struct si_shader_binary *binary,
enum pipe_shader_type shader_type,
+ unsigned wave_size,
struct pipe_debug_callback *debug,
const char *name, FILE *file)
{
if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
.info = &screen->info,
.shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = 64,
+ .wave_size = wave_size,
.num_parts = 1,
.elf_ptrs = &binary->elf_buffer,
.elf_sizes = &binary->elf_size }))
unsigned max_workgroup_size =
si_get_max_workgroup_size(shader);
lds_per_wave = (conf->lds_size * lds_increment) /
- DIV_ROUND_UP(max_workgroup_size, 64);
+ DIV_ROUND_UP(max_workgroup_size,
+ sscreen->compute_wave_size);
}
break;
default:;
if (screen->options.debug_disassembly)
si_shader_dump_disassembly(screen, &shader->binary,
shader->selector->type,
+ si_get_shader_wave_size(shader),
debug, "main", NULL);
pipe_debug_message(debug, SHADER_INFO,
if (!check_debug_option ||
(si_can_dump_shader(sscreen, shader_type) &&
!(sscreen->debug_flags & DBG(NO_ASM)))) {
+ unsigned wave_size = si_get_shader_wave_size(shader);
+
fprintf(file, "\n%s:\n", si_get_shader_name(shader));
if (shader->prolog)
si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
- shader_type, debug, "prolog", file);
+ shader_type, wave_size, debug, "prolog", file);
if (shader->previous_stage)
si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
- shader_type, debug, "previous stage", file);
+ shader_type, wave_size, debug, "previous stage", file);
if (shader->prolog2)
si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
- shader_type, debug, "prolog2", file);
+ shader_type, wave_size, debug, "prolog2", file);
- si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, debug, "main", file);
+ si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
+ wave_size, debug, "main", file);
if (shader->epilog)
si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
- shader_type, debug, "epilog", file);
+ shader_type, wave_size, debug, "epilog", file);
fprintf(file, "\n");
}
LLVMModuleRef mod,
struct pipe_debug_callback *debug,
enum pipe_shader_type shader_type,
+ unsigned wave_size,
const char *name,
bool less_optimized)
{
if (!si_replace_shader(count, binary)) {
unsigned r = si_llvm_compile(mod, binary, compiler, debug,
- less_optimized);
+ less_optimized, wave_size);
if (r)
return r;
}
if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
.info = &sscreen->info,
.shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = 64,
+ .wave_size = wave_size,
.num_parts = 1,
.elf_ptrs = &binary->elf_buffer,
.elf_sizes = &binary->elf_size }))
shader->selector = gs_selector;
shader->is_gs_copy_shader = true;
- si_init_shader_ctx(&ctx, sscreen, compiler);
+ si_init_shader_ctx(&ctx, sscreen, compiler,
+ si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false));
ctx.shader = shader;
ctx.type = PIPE_SHADER_VERTEX;
if (si_compile_llvm(sscreen, &ctx.shader->binary,
&ctx.shader->config, ctx.compiler,
ctx.ac.module,
- debug, PIPE_SHADER_GEOMETRY,
+ debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size,
"GS Copy Shader", false) == 0) {
if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
fprintf(stderr, "GS Copy Shader:\n");
static void si_init_shader_ctx(struct si_shader_context *ctx,
struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler)
+ struct ac_llvm_compiler *compiler,
+ unsigned wave_size)
{
struct lp_build_tgsi_context *bld_base;
- si_llvm_context_init(ctx, sscreen, compiler);
+ si_llvm_context_init(ctx, sscreen, compiler, wave_size);
bld_base = &ctx->bld_base;
bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
si_dump_streamout(&sel->so);
}
- si_init_shader_ctx(&ctx, sscreen, compiler);
+ si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
si_llvm_context_set_tgsi(&ctx, shader);
memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
/* Compile to bytecode. */
r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
- ctx.ac.module, debug, ctx.type,
+ ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
si_get_shader_name(shader),
si_should_optimize_less(compiler, shader->selector));
si_llvm_dispose(&ctx);
* LLVM 3.9svn has this bug.
*/
if (sel->type == PIPE_SHADER_COMPUTE) {
- unsigned wave_size = 64;
+ unsigned wave_size = sscreen->compute_wave_size;
unsigned max_vgprs = 256;
unsigned max_sgprs = sscreen->info.chip_class >= GFX8 ? 800 : 512;
unsigned max_sgprs_per_wave = 128;
}
struct si_shader_context ctx;
- si_init_shader_ctx(&ctx, sscreen, compiler);
+ si_init_shader_ctx(&ctx, sscreen, compiler,
+ si_get_wave_size(sscreen, type, shader.key.as_ngg));
ctx.shader = &shader;
ctx.type = type;
si_llvm_optimize_module(&ctx);
if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
- ctx.ac.module, debug, ctx.type, name, false)) {
+ ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
+ name, false)) {
FREE(result);
result = NULL;
goto out;
shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
if (shader->selector->type == PIPE_SHADER_COMPUTE &&
- si_get_max_workgroup_size(shader) > 64) {
+ si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
si_multiwave_lds_size_workaround(sscreen,
&shader->config.lds_size);
}
struct tgsi_shader_info *info);
void si_nir_scan_tess_ctrl(const struct nir_shader *nir,
struct tgsi_tessctrl_info *out);
-void si_lower_nir(struct si_shader_selector *sel);
+void si_lower_nir(struct si_shader_selector *sel, unsigned wave_size);
void si_nir_opts(struct nir_shader *nir);
/* si_state_shaders.c */
unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary,
struct ac_llvm_compiler *compiler,
struct pipe_debug_callback *debug,
- bool less_optimized);
+ bool less_optimized, unsigned wave_size);
LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base,
enum tgsi_opcode_type type);
void si_llvm_context_init(struct si_shader_context *ctx,
struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler);
+ struct ac_llvm_compiler *compiler,
+ unsigned wave_size);
void si_llvm_context_set_tgsi(struct si_shader_context *ctx,
struct si_shader *shader);
* selector is created.
*/
void
-si_lower_nir(struct si_shader_selector* sel)
+si_lower_nir(struct si_shader_selector* sel, unsigned wave_size)
{
/* Adjust the driver location of inputs and outputs. The state tracker
* interprets them as slots, while the ac/nir backend interprets them
NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options);
const nir_lower_subgroups_options subgroups_options = {
- .subgroup_size = 64,
- .ballot_bit_size = 64,
+ .subgroup_size = wave_size,
+ .ballot_bit_size = wave_size,
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_vote_trivial = false,
unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary,
struct ac_llvm_compiler *compiler,
struct pipe_debug_callback *debug,
- bool less_optimized)
+ bool less_optimized, unsigned wave_size)
{
- struct ac_compiler_passes *passes =
- less_optimized && compiler->low_opt_passes ?
- compiler->low_opt_passes : compiler->passes;
+ struct ac_compiler_passes *passes = compiler->passes;
+
+ if (wave_size == 32)
+ passes = compiler->passes_wave32;
+ else if (less_optimized && compiler->low_opt_passes)
+ passes = compiler->low_opt_passes;
+
struct si_llvm_diagnostics diag;
LLVMContextRef llvm_ctx;
void si_llvm_context_init(struct si_shader_context *ctx,
struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler)
+ struct ac_llvm_compiler *compiler,
+ unsigned wave_size)
{
struct lp_type type;
AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH;
ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class,
- sscreen->info.family, float_mode, 64);
+ sscreen->info.family, float_mode, wave_size);
ctx->gallivm.context = ctx->ac.context;
ctx->gallivm.module = ctx->ac.module;
unsigned num_dwords_per_thread,
bool dst_stream_cache_policy, bool is_copy)
{
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
if (!ureg)
return NULL;
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size);
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
values = malloc(num_mem_ops * sizeof(struct ureg_src));
}
- /* If there are multiple stores, the first store writes into 0+tid,
- * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc.
+ /* If there are multiple stores, the first store writes into 0*wavesize+tid,
+ * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
*/
- ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid);
+ ureg_UMAD(ureg, store_addr, blk,
+ ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid);
/* Convert from a "store size unit" into bytes. */
ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
ureg_imm1u(ureg, 4 * inst_dwords[0]));
if (is_copy && i < num_mem_ops) {
if (i) {
ureg_UADD(ureg, load_addr, ureg_src(load_addr),
- ureg_imm1u(ureg, 4 * inst_dwords[i] * 64));
+ ureg_imm1u(ureg, 4 * inst_dwords[i] *
+ sscreen->compute_wave_size));
}
values[i] = ureg_src(ureg_DECL_temporary(ureg));
if (d >= 0) {
if (d) {
ureg_UADD(ureg, store_addr, ureg_src(store_addr),
- ureg_imm1u(ureg, 4 * inst_dwords[d] * 64));
+ ureg_imm1u(ureg, 4 * inst_dwords[d] *
+ sscreen->compute_wave_size));
}
struct ureg_dst dst =
* occupy significantly more CUs.
*/
unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
- if (temp_verts_per_tg > 64 && temp_verts_per_tg % 64 < 48)
- *num_patches = (temp_verts_per_tg & ~63) / max_verts_per_patch;
+ unsigned wave_size = sctx->screen->ge_wave_size;
+
+ if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size*3/4)
+ *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
if (sctx->chip_class == GFX6) {
/* GFX6 bug workaround, related to power management. Limit LS-HS
* threadgroups to only one wave.
*/
- unsigned one_wave = 64 / max_verts_per_patch;
+ unsigned one_wave = wave_size / max_verts_per_patch;
*num_patches = MIN2(*num_patches, one_wave);
}
}
si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
- S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B428_VGPRS((shader->config.num_vgprs - 1) /
+ (sscreen->ge_wave_size == 32 ? 8 : 4)) |
(sscreen->info.chip_class <= GFX9 ?
S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) |
S_00B428_DX10_CLAMP(1) |
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) /
+ (sscreen->ge_wave_size == 32 ? 8 : 4)) |
S_00B228_FLOAT_MODE(shader->config.float_mode) |
S_00B228_DX10_CLAMP(1) |
S_00B228_MEM_ORDERED(1) |
if (sscreen->info.chip_class >= GFX10)
si_set_ge_pc_alloc(sscreen, pm4, false);
- uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) /
+ (sscreen->ge_wave_size == 32 ? 8 : 4)) |
S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
S_00B128_DX10_CLAMP(1) |
S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
/* Set interpolation controls. */
- spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader));
+ spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
+ S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40));
uint32_t rsrc1 =
- S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B028_VGPRS((shader->config.num_vgprs - 1) /
+ (sscreen->ps_wave_size == 32 ? 8 : 4)) |
S_00B028_DX10_CLAMP(1) |
S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
S_00B028_FLOAT_MODE(shader->config.float_mode);
assert(thread_index < ARRAY_SIZE(sscreen->compiler));
compiler = &sscreen->compiler[thread_index];
- if (sel->nir)
- si_lower_nir(sel);
+ if (sel->nir) {
+ /* TODO: GS always sets wave size = default. Legacy GS will have
+ * incorrect subgroup_size and ballot_bit_size. */
+ si_lower_nir(sel, si_get_wave_size(sscreen, sel->type, true));
+ }
/* Compile the main shader part for use with a prolog and/or epilog.
* If this fails, the driver will try to compile a monolithic shader
if (screen->info.chip_class >= GFX9)
stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+ if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+ stages |= S_028B54_HS_W32_EN(1) |
+ S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
+ S_028B54_VS_W32_EN(1);
+ }
+
si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
return pm4;
}