From 223b3174bd103d6a77309a4212516c837352a171 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 12 Aug 2019 20:37:11 -0400 Subject: [PATCH] radeonsi/nir: always lower ballot masks as 64-bit, codegen handles it This fixes KHR-GL45.shader_ballot_tests.ShaderBallotBitmasks. This solution is better, because the IR isn't dependent on wave32. --- src/amd/common/ac_llvm_build.c | 5 ++++- src/amd/common/ac_llvm_build.h | 6 +++++- src/amd/common/ac_nir_to_llvm.c | 2 ++ src/amd/vulkan/radv_nir_to_llvm.c | 5 +++-- src/gallium/drivers/radeonsi/si_compute.c | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 18 ++++++++++++------ src/gallium/drivers/radeonsi/si_shader.h | 2 +- .../drivers/radeonsi/si_shader_internal.h | 3 ++- src/gallium/drivers/radeonsi/si_shader_nir.c | 7 +++---- .../drivers/radeonsi/si_shader_tgsi_setup.c | 5 +++-- .../drivers/radeonsi/si_state_shaders.c | 7 ++----- 11 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 823bf34acdb..05871f5ea98 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -60,7 +60,8 @@ void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler, enum chip_class chip_class, enum radeon_family family, - enum ac_float_mode float_mode, unsigned wave_size) + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits) { LLVMValueRef args[1]; @@ -69,6 +70,7 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->chip_class = chip_class; ctx->family = family; ctx->wave_size = wave_size; + ctx->ballot_mask_bits = ballot_mask_bits; ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32 : compiler->tm, ctx->context); @@ -93,6 +95,7 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->v4f32 = LLVMVectorType(ctx->f32, 4); ctx->v8i32 = LLVMVectorType(ctx->i32, 8); ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size); + ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits); ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 6848a7ca082..103c3b484dd 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -81,6 +81,7 @@ struct ac_llvm_context { LLVMTypeRef v4f32; LLVMTypeRef v8i32; LLVMTypeRef iN_wavemask; + LLVMTypeRef iN_ballotmask; LLVMValueRef i8_0; LLVMValueRef i8_1; @@ -114,7 +115,9 @@ struct ac_llvm_context { enum chip_class chip_class; enum radeon_family family; + unsigned wave_size; + unsigned ballot_mask_bits; LLVMValueRef lds; }; @@ -123,7 +126,8 @@ void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler, enum chip_class chip_class, enum radeon_family family, - enum ac_float_mode float_mode, unsigned wave_size); + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits); void ac_llvm_context_dispose(struct ac_llvm_context *ctx); diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index ffac5ccea74..d97387ef13d 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3205,6 +3205,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, switch (instr->intrinsic) { case nir_intrinsic_ballot: result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); + if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size) + result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, ""); break; case nir_intrinsic_read_invocation: result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 7c3e840104d..c7c837d16f0 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -4320,7 +4320,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, AC_FLOAT_MODE_DEFAULT; ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, options->wave_size); + options->family, float_mode, options->wave_size, + options->wave_size); ctx.context = ctx.ac.context; radv_nir_shader_info_init(&shader_info->info); @@ -4834,7 +4835,7 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, AC_FLOAT_MODE_DEFAULT; ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, 64); + options->family, float_mode, 64, 64); ctx.context = ctx.ac.context; ctx.is_gs_copy_shader = true; diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index ad211ec0a41..a118ab246ae 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -128,7 +128,7 @@ static void si_create_compute_state_async(void *job, int thread_index) si_nir_opts(sel->nir); si_nir_scan_shader(sel->nir, &sel->info); - si_lower_nir(sel, sscreen->compute_wave_size); + si_lower_nir(sel); } /* Store the declared LDS size into tgsi_shader_info for the shader diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ea749becdde..64d7ec08348 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -50,7 +50,8 @@ static const char scratch_rsrc_dword1_symbol[] = static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - unsigned wave_size); + unsigned wave_size, + bool nir); static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, @@ -5725,7 +5726,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, shader->is_gs_copy_shader = true; si_init_shader_ctx(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false)); + si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false), + false); ctx.shader = shader; ctx.type = PIPE_SHADER_VERTEX; @@ -5989,11 +5991,13 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - unsigned wave_size) + unsigned wave_size, + bool nir) { struct lp_build_tgsi_context *bld_base; - si_llvm_context_init(ctx, sscreen, compiler, wave_size); + si_llvm_context_init(ctx, sscreen, compiler, wave_size, + nir ? 64 : wave_size); bld_base = &ctx->bld_base; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; @@ -6939,7 +6943,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_dump_streamout(&sel->so); } - si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); + si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader), + sel->nir != NULL); si_llvm_context_set_ir(&ctx, shader); memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, @@ -7319,7 +7324,8 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_context ctx; si_init_shader_ctx(&ctx, sscreen, compiler, si_get_wave_size(sscreen, type, shader.key.as_ngg, - shader.key.as_es)); + shader.key.as_es), + false); ctx.shader = &shader; ctx.type = type; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 2e4e4130e5c..502622d5199 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -757,7 +757,7 @@ void si_nir_scan_shader(const struct nir_shader *nir, void si_nir_scan_tess_ctrl(const struct nir_shader *nir, struct tgsi_tessctrl_info *out); void si_nir_lower_ps_inputs(struct nir_shader *nir); -void si_lower_nir(struct si_shader_selector *sel, unsigned wave_size); +void si_lower_nir(struct si_shader_selector *sel); void si_nir_opts(struct nir_shader *nir); /* si_state_shaders.c */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 5887d2632d7..b576d94a63f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -279,7 +279,8 @@ LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - unsigned wave_size); + unsigned wave_size, + unsigned ballot_mask_bits); void si_llvm_context_set_ir(struct si_shader_context *ctx, struct si_shader *shader); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index ea842452ca0..518d11c3b48 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -986,8 +986,7 @@ void si_nir_lower_ps_inputs(struct nir_shader *nir) * Perform "lowering" operations on the NIR that are run once when the shader * selector is created. */ -void -si_lower_nir(struct si_shader_selector* sel, unsigned wave_size) +void si_lower_nir(struct si_shader_selector *sel) { /* Adjust the driver location of inputs and outputs. The state tracker * interprets them as slots, while the ac/nir backend interprets them @@ -1023,8 +1022,8 @@ si_lower_nir(struct si_shader_selector* sel, unsigned wave_size) NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options); const nir_lower_subgroups_options subgroups_options = { - .subgroup_size = wave_size, - .ballot_bit_size = wave_size, + .subgroup_size = 64, + .ballot_bit_size = 64, .lower_to_scalar = true, .lower_subgroup_masks = true, .lower_vote_trivial = false, diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c index 2c1b3ebba77..39abd4b18f6 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c @@ -954,7 +954,8 @@ static void emit_immediate(struct lp_build_tgsi_context *bld_base, void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - unsigned wave_size) + unsigned wave_size, + unsigned ballot_mask_bits) { struct lp_type type; @@ -970,7 +971,7 @@ void si_llvm_context_init(struct si_shader_context *ctx, ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family, AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, - wave_size); + wave_size, ballot_mask_bits); ctx->gallivm.context = ctx->ac.context; ctx->gallivm.module = ctx->ac.module; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 2a2b5077398..8934e06ed9a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2427,11 +2427,8 @@ static void si_init_shader_selector_async(void *job, int thread_index) assert(thread_index < ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - if (sel->nir) { - /* TODO: GS always sets wave size = default. Legacy GS will have - * incorrect subgroup_size and ballot_bit_size. */ - si_lower_nir(sel, si_get_wave_size(sscreen, sel->type, true, false)); - } + if (sel->nir) + si_lower_nir(sel); /* Compile the main shader part for use with a prolog and/or epilog. * If this fails, the driver will try to compile a monolithic shader -- 2.30.2