radeonsi/nir: always lower ballot masks as 64-bit, codegen handles it
author Marek Olšák <marek.olsak@amd.com>
Tue, 13 Aug 2019 00:37:11 +0000 (20:37 -0400)
committer Marek Olšák <marek.olsak@amd.com>
Mon, 19 Aug 2019 21:23:38 +0000 (17:23 -0400)
This fixes KHR-GL45.shader_ballot_tests.ShaderBallotBitmasks.

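This solution is better because the NIR no longer depends on the wave
size (wave32 vs. wave64): ballot masks are always lowered as 64-bit, and
the LLVM codegen zero-extends the ballot result when the wave is narrower.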

src/amd/common/ac_llvm_build.c
src/amd/common/ac_llvm_build.h
src/amd/common/ac_nir_to_llvm.c
src/amd/vulkan/radv_nir_to_llvm.c
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_shader_internal.h
src/gallium/drivers/radeonsi/si_shader_nir.c
src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 823bf34acdbcf9b67fa3ff511698bb5af09eb94a..05871f5ea98abdacf4019392b351ce21d10d8f01 100644 (file)
@@ -60,7 +60,8 @@ void
 ac_llvm_context_init(struct ac_llvm_context *ctx,
                     struct ac_llvm_compiler *compiler,
                     enum chip_class chip_class, enum radeon_family family,
-                    enum ac_float_mode float_mode, unsigned wave_size)
+                    enum ac_float_mode float_mode, unsigned wave_size,
+                    unsigned ballot_mask_bits)
 {
        LLVMValueRef args[1];
 
@@ -69,6 +70,7 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
        ctx->chip_class = chip_class;
        ctx->family = family;
        ctx->wave_size = wave_size;
+       ctx->ballot_mask_bits = ballot_mask_bits;
        ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32
                                                       : compiler->tm,
                                       ctx->context);
@@ -93,6 +95,7 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
        ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
        ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
        ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
+       ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
 
        ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
        ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
index 6848a7ca082f6cf733be6079363bbfd15b0ffd7b..103c3b484dd7f0155266e9f0a3279add3d2ac527 100644 (file)
@@ -81,6 +81,7 @@ struct ac_llvm_context {
        LLVMTypeRef v4f32;
        LLVMTypeRef v8i32;
        LLVMTypeRef iN_wavemask;
+       LLVMTypeRef iN_ballotmask;
 
        LLVMValueRef i8_0;
        LLVMValueRef i8_1;
@@ -114,7 +115,9 @@ struct ac_llvm_context {
 
        enum chip_class chip_class;
        enum radeon_family family;
+
        unsigned wave_size;
+       unsigned ballot_mask_bits;
 
        LLVMValueRef lds;
 };
@@ -123,7 +126,8 @@ void
 ac_llvm_context_init(struct ac_llvm_context *ctx,
                     struct ac_llvm_compiler *compiler,
                     enum chip_class chip_class, enum radeon_family family,
-                    enum ac_float_mode float_mode, unsigned wave_size);
+                    enum ac_float_mode float_mode, unsigned wave_size,
+                    unsigned ballot_mask_bits);
 
 void
 ac_llvm_context_dispose(struct ac_llvm_context *ctx);
index ffac5ccea744a0e0e662703e73dc1a0320482f72..d97387ef13d275daafdf9d87eb645e1b792a73c2 100644 (file)
@@ -3205,6 +3205,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        switch (instr->intrinsic) {
        case nir_intrinsic_ballot:
                result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0]));
+               if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size)
+                       result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, "");
                break;
        case nir_intrinsic_read_invocation:
                result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]),
index 7c3e840104d9ce6e1a3d845be5e6f857ff3b35fe..c7c837d16f0c6bdb4d03923b220ebb231f463519 100644 (file)
@@ -4320,7 +4320,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                                       AC_FLOAT_MODE_DEFAULT;
 
        ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
-                            options->family, float_mode, options->wave_size);
+                            options->family, float_mode, options->wave_size,
+                            options->wave_size);
        ctx.context = ctx.ac.context;
 
        radv_nir_shader_info_init(&shader_info->info);
@@ -4834,7 +4835,7 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
                                       AC_FLOAT_MODE_DEFAULT;
 
        ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
-                            options->family, float_mode, 64);
+                            options->family, float_mode, 64, 64);
        ctx.context = ctx.ac.context;
 
        ctx.is_gs_copy_shader = true;
index ad211ec0a41534365220fbc8cdea9a98c4e1a660..a118ab246aeedcdf5afb160953149d22a17636f1 100644 (file)
@@ -128,7 +128,7 @@ static void si_create_compute_state_async(void *job, int thread_index)
 
                si_nir_opts(sel->nir);
                si_nir_scan_shader(sel->nir, &sel->info);
-               si_lower_nir(sel, sscreen->compute_wave_size);
+               si_lower_nir(sel);
        }
 
        /* Store the declared LDS size into tgsi_shader_info for the shader
index ea749becdde2aaddb6739f1cabcffcf3daaf9ec8..64d7ec08348f9f7454b12e2ece0351ff13c628c2 100644 (file)
@@ -50,7 +50,8 @@ static const char scratch_rsrc_dword1_symbol[] =
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
-                              unsigned wave_size);
+                              unsigned wave_size,
+                              bool nir);
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
                                 struct lp_build_tgsi_context *bld_base,
@@ -5725,7 +5726,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
        shader->is_gs_copy_shader = true;
 
        si_init_shader_ctx(&ctx, sscreen, compiler,
-                          si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false));
+                          si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false),
+                          false);
        ctx.shader = shader;
        ctx.type = PIPE_SHADER_VERTEX;
 
@@ -5989,11 +5991,13 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
-                              unsigned wave_size)
+                              unsigned wave_size,
+                              bool nir)
 {
        struct lp_build_tgsi_context *bld_base;
 
-       si_llvm_context_init(ctx, sscreen, compiler, wave_size);
+       si_llvm_context_init(ctx, sscreen, compiler, wave_size,
+                            nir ? 64 : wave_size);
 
        bld_base = &ctx->bld_base;
        bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -6939,7 +6943,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                si_dump_streamout(&sel->so);
        }
 
-       si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
+       si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader),
+                          sel->nir != NULL);
        si_llvm_context_set_ir(&ctx, shader);
 
        memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
@@ -7319,7 +7324,8 @@ si_get_shader_part(struct si_screen *sscreen,
        struct si_shader_context ctx;
        si_init_shader_ctx(&ctx, sscreen, compiler,
                           si_get_wave_size(sscreen, type, shader.key.as_ngg,
-                                           shader.key.as_es));
+                                           shader.key.as_es),
+                          false);
        ctx.shader = &shader;
        ctx.type = type;
 
index 2e4e4130e5cb29393240ecbf0eff376c16891f31..502622d519960809ff7c6f01b5c5022a30217725 100644 (file)
@@ -757,7 +757,7 @@ void si_nir_scan_shader(const struct nir_shader *nir,
 void si_nir_scan_tess_ctrl(const struct nir_shader *nir,
                           struct tgsi_tessctrl_info *out);
 void si_nir_lower_ps_inputs(struct nir_shader *nir);
-void si_lower_nir(struct si_shader_selector *sel, unsigned wave_size);
+void si_lower_nir(struct si_shader_selector *sel);
 void si_nir_opts(struct nir_shader *nir);
 
 /* si_state_shaders.c */
index 5887d2632d7339ed94bf0d51ed0399616d4820d1..b576d94a63f2488376be1d88c6a38013de30dccc 100644 (file)
@@ -279,7 +279,8 @@ LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx,
 void si_llvm_context_init(struct si_shader_context *ctx,
                          struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler,
-                         unsigned wave_size);
+                         unsigned wave_size,
+                         unsigned ballot_mask_bits);
 void si_llvm_context_set_ir(struct si_shader_context *ctx,
                            struct si_shader *shader);
 
index ea842452ca0397f238a9128c845c6803a0a74417..518d11c3b48a8a5e8e8b6d152da941e61c9aeeec 100644 (file)
@@ -986,8 +986,7 @@ void si_nir_lower_ps_inputs(struct nir_shader *nir)
  * Perform "lowering" operations on the NIR that are run once when the shader
  * selector is created.
  */
-void
-si_lower_nir(struct si_shader_selector* sel, unsigned wave_size)
+void si_lower_nir(struct si_shader_selector *sel)
 {
        /* Adjust the driver location of inputs and outputs. The state tracker
         * interprets them as slots, while the ac/nir backend interprets them
@@ -1023,8 +1022,8 @@ si_lower_nir(struct si_shader_selector* sel, unsigned wave_size)
        NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options);
 
        const nir_lower_subgroups_options subgroups_options = {
-               .subgroup_size = wave_size,
-               .ballot_bit_size = wave_size,
+               .subgroup_size = 64,
+               .ballot_bit_size = 64,
                .lower_to_scalar = true,
                .lower_subgroup_masks = true,
                .lower_vote_trivial = false,
index 2c1b3ebba77c9427325388dbc34ea2652228523d..39abd4b18f63aa730da1ef09c7de037e94276cb7 100644 (file)
@@ -954,7 +954,8 @@ static void emit_immediate(struct lp_build_tgsi_context *bld_base,
 void si_llvm_context_init(struct si_shader_context *ctx,
                          struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler,
-                         unsigned wave_size)
+                         unsigned wave_size,
+                         unsigned ballot_mask_bits)
 {
        struct lp_type type;
 
@@ -970,7 +971,7 @@ void si_llvm_context_init(struct si_shader_context *ctx,
        ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class,
                             sscreen->info.family,
                             AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH,
-                            wave_size);
+                            wave_size, ballot_mask_bits);
 
        ctx->gallivm.context = ctx->ac.context;
        ctx->gallivm.module = ctx->ac.module;
index 2a2b50773989afb0d983301db8fc0f4b3fb586fb..8934e06ed9a76cae101402fe6741090ee0ad6fce 100644 (file)
@@ -2427,11 +2427,8 @@ static void si_init_shader_selector_async(void *job, int thread_index)
        assert(thread_index < ARRAY_SIZE(sscreen->compiler));
        compiler = &sscreen->compiler[thread_index];
 
-       if (sel->nir) {
-               /* TODO: GS always sets wave size = default. Legacy GS will have
-                * incorrect subgroup_size and ballot_bit_size. */
-               si_lower_nir(sel, si_get_wave_size(sscreen, sel->type, true, false));
-       }
+       if (sel->nir)
+               si_lower_nir(sel);
 
        /* Compile the main shader part for use with a prolog and/or epilog.
         * If this fails, the driver will try to compile a monolithic shader