From: Marek Olšák Date: Fri, 12 Jul 2019 21:37:29 +0000 (-0400) Subject: radeonsi/gfx10: implement Wave32 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=88efb63cafcf249a370ee27683c41afde3b95ffd;p=mesa.git radeonsi/gfx10: implement Wave32 Acked-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Samuel Pitoiset --- diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 6b3c1017fb2..c4bdff35deb 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -44,7 +44,7 @@ static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx) LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, 64, false), ""); + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); } @@ -1047,7 +1047,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); - numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, ""); ac_build_ifcc(&ctx->ac, tmp, 5105); @@ -1423,7 +1423,7 @@ void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) /* Round up towards full wave sizes for better ALU utilization. */ if (!max_vert_out_per_gs_instance) { - const unsigned wavesize = 64; + const unsigned wavesize = gs_sel->screen->ge_wave_size; unsigned orig_max_esverts; unsigned orig_max_gsprims; do { diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 97f692a18dc..12cbe194f63 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -68,7 +68,7 @@ static const amd_kernel_code_t *si_compute_get_code_object( if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ .info = &sel->screen->info, .shader_type = MESA_SHADER_COMPUTE, - .wave_size = 64, + .wave_size = sel->screen->compute_wave_size, .num_parts = 1, .elf_ptrs = &program->shader.binary.elf_buffer, .elf_sizes = &program->shader.binary.elf_size })) @@ -127,7 +127,7 @@ static void si_create_compute_state_async(void *job, int thread_index) si_nir_opts(sel->nir); si_nir_scan_shader(sel->nir, &sel->info); - si_lower_nir(sel); + si_lower_nir(sel, sscreen->compute_wave_size); } /* Store the declared LDS size into tgsi_shader_info for the shader @@ -178,7 +178,8 @@ static void si_create_compute_state_async(void *job, int thread_index) program->num_cs_user_data_dwords; shader->config.rsrc1 = - S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->compute_wave_size == 32 ? 8 : 4)) | S_00B848_DX10_CLAMP(1) | S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) | @@ -746,7 +747,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2]; unsigned waves_per_threadgroup = - DIV_ROUND_UP(threads_per_threadgroup, 64); + DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size); unsigned threadgroups_per_cu = 1; if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) @@ -763,7 +764,8 @@ static void si_emit_dispatch_packets(struct si_context *sctx, S_00B800_FORCE_START_AT_000(1) | /* If the KMD allows it (there is a KMD hw register for it), * allow launching waves out-of-order. (same as Vulkan) */ - S_00B800_ORDER_MODE(sctx->chip_class >= GFX7); + S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) | + S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32); const uint *last_block = info->last_block; bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 240baa4b164..7d5124e51b2 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -117,13 +117,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD; unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; - unsigned dwords_per_wave = dwords_per_thread * 64; + unsigned wave_size = sctx->screen->compute_wave_size; + unsigned dwords_per_wave = dwords_per_thread * wave_size; unsigned num_dwords = size / 4; unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); struct pipe_grid_info info = {}; - info.block[0] = MIN2(64, num_instructions); + info.block[0] = MIN2(wave_size, num_instructions); info.block[1] = 1; info.block[2] = 1; info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index 6eddfcbe8dc..630d749856d 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -939,12 +939,13 @@ static void si_add_split_disasm(struct si_screen *screen, uint64_t *addr, unsigned *num, struct si_shader_inst *instructions, - enum pipe_shader_type shader_type) + enum pipe_shader_type shader_type, + unsigned wave_size) { if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){ .info = &screen->info, .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = 64, + .wave_size = wave_size, .num_parts = 1, .elf_ptrs = &binary->elf_buffer, .elf_sizes = &binary->elf_size })) @@ -1016,27 +1017,28 @@ static void si_print_annotated_shader(struct si_shader *shader, */ unsigned num_inst = 0; uint64_t inst_addr = start_addr; + unsigned wave_size = si_get_shader_wave_size(shader); struct ac_rtld_binary rtld_binaries[5] = {}; struct si_shader_inst *instructions = calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); if (shader->prolog) { si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, - &inst_addr, &num_inst, instructions, shader_type); + &inst_addr, &num_inst, instructions, shader_type, wave_size); } if (shader->previous_stage) { si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, - &inst_addr, &num_inst, instructions, shader_type); + &inst_addr, &num_inst, instructions, shader_type, wave_size); } if (shader->prolog2) { si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, - &inst_addr, &num_inst, instructions, shader_type); + &inst_addr, &num_inst, instructions, shader_type, wave_size); } si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, - &inst_addr, &num_inst, instructions, shader_type); + &inst_addr, &num_inst, instructions, shader_type, wave_size); if (shader->epilog) { si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, - &inst_addr, &num_inst, instructions, shader_type); + &inst_addr, &num_inst, instructions, shader_type, wave_size); } fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index b784d4ad0e9..0e93038b7a5 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -883,7 +883,7 @@ static int si_get_compute_param(struct pipe_screen *screen, case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: if (ret) { uint32_t *subgroup_size = ret; - *subgroup_size = 64; + *subgroup_size = sscreen->compute_wave_size; } return sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index f929d07fcf6..7b2c2dc2ee9 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -137,6 +137,8 @@ static void si_init_compiler(struct si_screen *sscreen, ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); compiler->passes = ac_create_llvm_passes(compiler->tm); + if (compiler->tm_wave32) + compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); if (compiler->low_opt_tm) compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } @@ -1212,6 +1214,10 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws, for (i = 0; i < num_comp_lo_threads; i++) si_init_compiler(sscreen, &sscreen->compiler_lowp[i]); + sscreen->ge_wave_size = 64; + sscreen->ps_wave_size = 64; + sscreen->compute_wave_size = 64; + /* Create the auxiliary context. This must be done last. */ sscreen->aux_context = si_create_context( &sscreen->b, sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0c4bab941b3..47cfac4705b 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -602,6 +602,10 @@ struct si_screen { /* Use at most 2 low priority threads on quadcore and better. * We want to minimize the impact on multithreaded Mesa. */ struct ac_llvm_compiler compiler_lowp[10]; + + unsigned compute_wave_size; + unsigned ps_wave_size; + unsigned ge_wave_size; }; struct si_blend_color { @@ -1889,6 +1893,26 @@ static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) return sctx->prim_discard_vertex_count_threshold != UINT_MAX; } +static inline unsigned si_get_wave_size(struct si_screen *sscreen, + enum pipe_shader_type shader_type, + bool ngg) +{ + if (shader_type == PIPE_SHADER_COMPUTE) + return sscreen->compute_wave_size; + else if (shader_type == PIPE_SHADER_FRAGMENT) + return sscreen->ps_wave_size; + else if (shader_type == PIPE_SHADER_GEOMETRY && !ngg) /* legacy GS only supports Wave64 */ + return 64; + else + return sscreen->ge_wave_size; +} + +static inline unsigned si_get_shader_wave_size(struct si_shader *shader) +{ + return si_get_wave_size(shader->selector->screen, shader->selector->type, + shader->key.as_ngg); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 23a6a7455ec..4bdaa7f08fe 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -49,7 +49,8 @@ static const char scratch_rsrc_dword1_symbol[] = static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, - struct ac_llvm_compiler *compiler); + struct ac_llvm_compiler *compiler, + unsigned wave_size); static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, @@ -2168,7 +2169,7 @@ void si_load_system_value(struct si_shader_context *ctx, break; case TGSI_SEMANTIC_SUBGROUP_SIZE: - value = LLVMConstInt(ctx->i32, 64, 0); + value = LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0); break; case TGSI_SEMANTIC_SUBGROUP_INVOCATION: @@ -3555,7 +3556,7 @@ static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, LLVMBuildMul(ctx->ac.builder, wave_idx, - LLVMConstInt(ctx->i32, 64, false), ""), ""); + LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), ""); lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); } @@ -5137,14 +5138,14 @@ static void preload_ring_buffers(struct si_shader_context *ctx) /* Limit on the stride field for <= GFX7. */ assert(stride < (1 << 14)); - num_records = 64; + num_records = ctx->ac.wave_size; ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->i64, stream_offset, 0), ""); - stream_offset += stride * 64; + stream_offset += stride * ctx->ac.wave_size; ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); @@ -5270,7 +5271,7 @@ static bool si_shader_binary_open(struct si_screen *screen, .halt_at_entry = screen->options.halt_shaders, }, .shader_type = tgsi_processor_to_shader_stage(sel->type), - .wave_size = 64, + .wave_size = si_get_shader_wave_size(shader), .num_parts = num_parts, .elf_ptrs = part_elfs, .elf_sizes = part_sizes, @@ -5357,6 +5358,7 @@ bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader static void si_shader_dump_disassembly(struct si_screen *screen, const struct si_shader_binary *binary, enum pipe_shader_type shader_type, + unsigned wave_size, struct pipe_debug_callback *debug, const char *name, FILE *file) { @@ -5365,7 +5367,7 @@ static void si_shader_dump_disassembly(struct si_screen *screen, if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ .info = &screen->info, .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = 64, + .wave_size = wave_size, .num_parts = 1, .elf_ptrs = &binary->elf_buffer, .elf_sizes = &binary->elf_size })) @@ -5449,7 +5451,8 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) unsigned max_workgroup_size = si_get_max_workgroup_size(shader); lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, 64); + DIV_ROUND_UP(max_workgroup_size, + sscreen->compute_wave_size); } break; default:; @@ -5482,6 +5485,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, if (screen->options.debug_disassembly) si_shader_dump_disassembly(screen, &shader->binary, shader->selector->type, + si_get_shader_wave_size(shader), debug, "main", NULL); pipe_debug_message(debug, SHADER_INFO, @@ -5594,23 +5598,26 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, if (!check_debug_option || (si_can_dump_shader(sscreen, shader_type) && !(sscreen->debug_flags & DBG(NO_ASM)))) { + unsigned wave_size = si_get_shader_wave_size(shader); + fprintf(file, "\n%s:\n", si_get_shader_name(shader)); if (shader->prolog) si_shader_dump_disassembly(sscreen, &shader->prolog->binary, - shader_type, debug, "prolog", file); + shader_type, wave_size, debug, "prolog", file); if (shader->previous_stage) si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, - shader_type, debug, "previous stage", file); + shader_type, wave_size, debug, "previous stage", file); if (shader->prolog2) si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, - shader_type, debug, "prolog2", file); + shader_type, wave_size, debug, "prolog2", file); - si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, debug, "main", file); + si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, + wave_size, debug, "main", file); if (shader->epilog) si_shader_dump_disassembly(sscreen, &shader->epilog->binary, - shader_type, debug, "epilog", file); + shader_type, wave_size, debug, "epilog", file); fprintf(file, "\n"); } @@ -5624,6 +5631,7 @@ static int si_compile_llvm(struct si_screen *sscreen, LLVMModuleRef mod, struct pipe_debug_callback *debug, enum pipe_shader_type shader_type, + unsigned wave_size, const char *name, bool less_optimized) { @@ -5647,7 +5655,7 @@ static int si_compile_llvm(struct si_screen *sscreen, if (!si_replace_shader(count, binary)) { unsigned r = si_llvm_compile(mod, binary, compiler, debug, - less_optimized); + less_optimized, wave_size); if (r) return r; } @@ -5656,7 +5664,7 @@ static int si_compile_llvm(struct si_screen *sscreen, if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ .info = &sscreen->info, .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = 64, + .wave_size = wave_size, .num_parts = 1, .elf_ptrs = &binary->elf_buffer, .elf_sizes = &binary->elf_size })) @@ -5718,7 +5726,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, shader->selector = gs_selector; shader->is_gs_copy_shader = true; - si_init_shader_ctx(&ctx, sscreen, compiler); + si_init_shader_ctx(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false)); ctx.shader = shader; ctx.type = PIPE_SHADER_VERTEX; @@ -5817,7 +5826,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, ctx.ac.module, - debug, PIPE_SHADER_GEOMETRY, + debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size, "GS Copy Shader", false) == 0) { if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) fprintf(stderr, "GS Copy Shader:\n"); @@ -5972,11 +5981,12 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, - struct ac_llvm_compiler *compiler) + struct ac_llvm_compiler *compiler, + unsigned wave_size) { struct lp_build_tgsi_context *bld_base; - si_llvm_context_init(ctx, sscreen, compiler); + si_llvm_context_init(ctx, sscreen, compiler, wave_size); bld_base = &ctx->bld_base; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; @@ -6917,7 +6927,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_dump_streamout(&sel->so); } - si_init_shader_ctx(&ctx, sscreen, compiler); + si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); si_llvm_context_set_tgsi(&ctx, shader); memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, @@ -7133,7 +7143,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, /* Compile to bytecode. */ r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, - ctx.ac.module, debug, ctx.type, + ctx.ac.module, debug, ctx.type, ctx.ac.wave_size, si_get_shader_name(shader), si_should_optimize_less(compiler, shader->selector)); si_llvm_dispose(&ctx); @@ -7146,7 +7156,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, * LLVM 3.9svn has this bug. */ if (sel->type == PIPE_SHADER_COMPUTE) { - unsigned wave_size = 64; + unsigned wave_size = sscreen->compute_wave_size; unsigned max_vgprs = 256; unsigned max_sgprs = sscreen->info.chip_class >= GFX8 ? 800 : 512; unsigned max_sgprs_per_wave = 128; @@ -7294,7 +7304,8 @@ si_get_shader_part(struct si_screen *sscreen, } struct si_shader_context ctx; - si_init_shader_ctx(&ctx, sscreen, compiler); + si_init_shader_ctx(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, type, shader.key.as_ngg)); ctx.shader = &shader; ctx.type = type; @@ -7304,7 +7315,8 @@ si_get_shader_part(struct si_screen *sscreen, si_llvm_optimize_module(&ctx); if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler, - ctx.ac.module, debug, ctx.type, name, false)) { + ctx.ac.module, debug, ctx.type, ctx.ac.wave_size, + name, false)) { FREE(result); result = NULL; goto out; @@ -8224,7 +8236,7 @@ static void si_fix_resource_usage(struct si_screen *sscreen, shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); if (shader->selector->type == PIPE_SHADER_COMPUTE && - si_get_max_workgroup_size(shader) > 64) { + si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 15698bcddeb..1b4f29bac8a 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -772,7 +772,7 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct tgsi_shader_info *info); void si_nir_scan_tess_ctrl(const struct nir_shader *nir, struct tgsi_tessctrl_info *out); -void si_lower_nir(struct si_shader_selector *sel); +void si_lower_nir(struct si_shader_selector *sel, unsigned wave_size); void si_nir_opts(struct nir_shader *nir); /* si_state_shaders.c */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 90e3f9844a8..aa4e083ec1a 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -263,7 +263,7 @@ void si_create_function(struct si_shader_context *ctx, unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary, struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, - bool less_optimized); + bool less_optimized, unsigned wave_size); LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type); @@ -277,7 +277,8 @@ LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, - struct ac_llvm_compiler *compiler); + struct ac_llvm_compiler *compiler, + unsigned wave_size); void si_llvm_context_set_tgsi(struct si_shader_context *ctx, struct si_shader *shader); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 743827307ba..0f964bda88f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -901,7 +901,7 @@ si_nir_lower_color(nir_shader *nir) * selector is created. */ void -si_lower_nir(struct si_shader_selector* sel) +si_lower_nir(struct si_shader_selector* sel, unsigned wave_size) { /* Adjust the driver location of inputs and outputs. The state tracker * interprets them as slots, while the ac/nir backend interprets them @@ -956,8 +956,8 @@ si_lower_nir(struct si_shader_selector* sel) NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options); const nir_lower_subgroups_options subgroups_options = { - .subgroup_size = 64, - .ballot_bit_size = 64, + .subgroup_size = wave_size, + .ballot_bit_size = wave_size, .lower_to_scalar = true, .lower_subgroup_masks = true, .lower_vote_trivial = false, diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c index c534a445db4..c5f400d8791 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c @@ -83,11 +83,15 @@ static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary, struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, - bool less_optimized) + bool less_optimized, unsigned wave_size) { - struct ac_compiler_passes *passes = - less_optimized && compiler->low_opt_passes ? - compiler->low_opt_passes : compiler->passes; + struct ac_compiler_passes *passes = compiler->passes; + + if (wave_size == 32) + passes = compiler->passes_wave32; + else if (less_optimized && compiler->low_opt_passes) + passes = compiler->low_opt_passes; + struct si_llvm_diagnostics diag; LLVMContextRef llvm_ctx; @@ -949,7 +953,8 @@ static void emit_immediate(struct lp_build_tgsi_context *bld_base, void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, - struct ac_llvm_compiler *compiler) + struct ac_llvm_compiler *compiler, + unsigned wave_size) { struct lp_type type; @@ -968,7 +973,7 @@ void si_llvm_context_init(struct si_shader_context *ctx, AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH; ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, - sscreen->info.family, float_mode, 64); + sscreen->info.family, float_mode, wave_size); ctx->gallivm.context = ctx->ac.context; ctx->gallivm.module = ctx->ac.module; diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 9f2f9d30216..15bb475d89a 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -124,6 +124,7 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, bool dst_stream_cache_policy, bool is_copy) { + struct si_screen *sscreen = (struct si_screen *)ctx->screen; assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; @@ -145,7 +146,7 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, if (!ureg) return NULL; - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size); ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); @@ -168,10 +169,11 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, values = malloc(num_mem_ops * sizeof(struct ureg_src)); } - /* If there are multiple stores, the first store writes into 0+tid, - * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc. + /* If there are multiple stores, the first store writes into 0*wavesize+tid, + * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. */ - ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid); + ureg_UMAD(ureg, store_addr, blk, + ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid); /* Convert from a "store size unit" into bytes. */ ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0])); @@ -186,7 +188,8 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, if (is_copy && i < num_mem_ops) { if (i) { ureg_UADD(ureg, load_addr, ureg_src(load_addr), - ureg_imm1u(ureg, 4 * inst_dwords[i] * 64)); + ureg_imm1u(ureg, 4 * inst_dwords[i] * + sscreen->compute_wave_size)); } values[i] = ureg_src(ureg_DECL_temporary(ureg)); @@ -201,7 +204,8 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, if (d >= 0) { if (d) { ureg_UADD(ureg, store_addr, ureg_src(store_addr), - ureg_imm1u(ureg, 4 * inst_dwords[d] * 64)); + ureg_imm1u(ureg, 4 * inst_dwords[d] * + sscreen->compute_wave_size)); } struct ureg_dst dst = diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 33153368738..dca0840a693 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -183,14 +183,16 @@ static void si_emit_derived_tess_state(struct si_context *sctx, * occupy significantly more CUs. */ unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; - if (temp_verts_per_tg > 64 && temp_verts_per_tg % 64 < 48) - *num_patches = (temp_verts_per_tg & ~63) / max_verts_per_patch; + unsigned wave_size = sctx->screen->ge_wave_size; + + if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size*3/4) + *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; if (sctx->chip_class == GFX6) { /* GFX6 bug workaround, related to power management. Limit LS-HS * threadgroups to only one wave. */ - unsigned one_wave = 64 / max_verts_per_patch; + unsigned one_wave = wave_size / max_verts_per_patch; *num_patches = MIN2(*num_patches, one_wave); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 40572acc23a..48b8b7368e3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -553,7 +553,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) } si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, - S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B428_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->ge_wave_size == 32 ? 8 : 4)) | (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) | S_00B428_DX10_CLAMP(1) | @@ -1153,7 +1154,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B228_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->ge_wave_size == 32 ? 8 : 4)) | S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) | S_00B228_MEM_ORDERED(1) | @@ -1399,7 +1401,8 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, if (sscreen->info.chip_class >= GFX10) si_set_ge_pc_alloc(sscreen, pm4, false); - uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) | + uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->ge_wave_size == 32 ? 8 : 4)) | S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) | S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | @@ -1610,7 +1613,8 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; /* Set interpolation controls. */ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)); + spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; @@ -1627,7 +1631,8 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); uint32_t rsrc1 = - S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B028_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->ps_wave_size == 32 ? 8 : 4)) | S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | S_00B028_FLOAT_MODE(shader->config.float_mode); @@ -2421,8 +2426,11 @@ static void si_init_shader_selector_async(void *job, int thread_index) assert(thread_index < ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - if (sel->nir) - si_lower_nir(sel); + if (sel->nir) { + /* TODO: GS always sets wave size = default. Legacy GS will have + * incorrect subgroup_size and ballot_bit_size. */ + si_lower_nir(sel, si_get_wave_size(sscreen, sel->type, true)); + } /* Compile the main shader part for use with a prolog and/or epilog. * If this fails, the driver will try to compile a monolithic shader @@ -3785,6 +3793,12 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, if (screen->info.chip_class >= GFX9) stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); + if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { + stages |= S_028B54_HS_W32_EN(1) | + S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ + S_028B54_VS_W32_EN(1); + } + si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); return pm4; }