radv: make use of has_ls_vgpr_init_bug
[mesa.git] / src / amd / vulkan / radv_nir_to_llvm.c
index b87258d469521314beeb8053e30a887fd2cd2161..fbfe12da562826fbcd8c48b6cd07198c308a5302 100644 (file)
@@ -55,7 +55,7 @@ struct radv_shader_context {
        LLVMContextRef context;
        LLVMValueRef main_function;
 
-       LLVMValueRef descriptor_sets[RADV_UD_MAX_SETS];
+       LLVMValueRef descriptor_sets[MAX_SETS];
        LLVMValueRef ring_offsets;
 
        LLVMValueRef vertex_buffers;
@@ -125,6 +125,13 @@ struct radv_shader_context {
        LLVMValueRef vertexptr; /* GFX10 only */
 };
 
+struct radv_shader_output_values {
+       LLVMValueRef values[4];
+       unsigned slot_name;
+       unsigned slot_index;
+       unsigned usage_mask;
+};
+
 enum radeon_llvm_calling_convention {
        RADEON_LLVM_AMDGPU_VS = 87,
        RADEON_LLVM_AMDGPU_GS = 88,
@@ -288,7 +295,7 @@ get_tcs_num_patches(struct radv_shader_context *ctx)
 
        /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
        if (ctx->options->chip_class == GFX6) {
-               unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+               unsigned one_wave = ctx->options->wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp);
                num_patches = MIN2(num_patches, one_wave);
        }
        return num_patches;
@@ -845,9 +852,15 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
                        }
                } else {
                        if (ctx->ac.chip_class >= GFX10) {
-                               add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
-                               add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
-                               add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+                               if (ctx->options->key.vs_common_out.as_ngg) {
+                                       add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+                                       add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
+                                       add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+                               } else {
+                                       add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
+                                       add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
+                                       add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
+                               }
                        } else {
                                add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
                                add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
@@ -863,9 +876,6 @@ declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage,
 {
        int i;
 
-       if (ctx->ac.chip_class >= GFX10)
-               return;
-
        /* Streamout SGPRs. */
        if (ctx->shader_info->info.so.num_outputs) {
                assert(stage == MESA_SHADER_VERTEX ||
@@ -1366,9 +1376,16 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
                uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+               if (ctx->ac.chip_class >= GFX10) {
+                       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                                    S_008F0C_OOB_SELECT(3) |
+                                    S_008F0C_RESOURCE_LEVEL(1);
+               } else {
+                       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+               }
 
                LLVMValueRef desc_components[4] = {
                        LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
@@ -2606,10 +2623,10 @@ static void
 radv_emit_stream_output(struct radv_shader_context *ctx,
                         LLVMValueRef const *so_buffers,
                         LLVMValueRef const *so_write_offsets,
-                        const struct radv_stream_output *output)
+                        const struct radv_stream_output *output,
+                        struct radv_shader_output_values *shader_out)
 {
        unsigned num_comps = util_bitcount(output->component_mask);
-       unsigned loc = output->location;
        unsigned buf = output->buffer;
        unsigned offset = output->offset;
        unsigned start;
@@ -2624,8 +2641,7 @@ radv_emit_stream_output(struct radv_shader_context *ctx,
 
        /* Load the output as int. */
        for (int i = 0; i < num_comps; i++) {
-               out[i] = ac_to_integer(&ctx->ac,
-                                      radv_load_output(ctx, loc, start + i));
+               out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
        }
 
        /* Pack the output. */
@@ -2722,26 +2738,25 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
 
                /* Write streamout data. */
                for (i = 0; i < ctx->shader_info->info.so.num_outputs; i++) {
+                       struct radv_shader_output_values shader_out = {};
                        struct radv_stream_output *output =
                                &ctx->shader_info->info.so.outputs[i];
 
                        if (stream != output->stream)
                                continue;
 
-                       radv_emit_stream_output(ctx, so_buffers,
-                                               so_write_offset, output);
+                       for (int j = 0; j < 4; j++) {
+                               shader_out.values[j] =
+                                       radv_load_output(ctx, output->location, j);
+                       }
+
+                       radv_emit_stream_output(ctx, so_buffers,so_write_offset,
+                                               output, &shader_out);
                }
        }
        ac_nir_build_endif(&if_ctx);
 }
 
-struct radv_shader_output_values {
-       LLVMValueRef values[4];
-       unsigned slot_name;
-       unsigned slot_index;
-       unsigned usage_mask;
-};
-
 static void
 radv_build_param_exports(struct radv_shader_context *ctx,
                         struct radv_shader_output_values *outputs,
@@ -3023,7 +3038,8 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
                LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
                vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
                                         LLVMBuildMul(ctx->ac.builder, wave_idx,
-                                                     LLVMConstInt(ctx->ac.i32, 64, false), ""), "");
+                                                     LLVMConstInt(ctx->ac.i32,
+                                                                  ctx->ac.wave_size, false), ""), "");
                lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
                                        LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
        }
@@ -3125,7 +3141,7 @@ static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx)
        LLVMBuilderRef builder = ctx->ac.builder;
        LLVMValueRef tmp;
        tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
-                          LLVMConstInt(ctx->ac.i32, 64, false), "");
+                          LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
        return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
 }
 
@@ -4175,7 +4191,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
                 */
                LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
                uint64_t stream_offset = 0;
-               unsigned num_records = 64;
+               unsigned num_records = ctx->ac.wave_size;
                LLVMValueRef base_ring;
 
                base_ring =
@@ -4208,7 +4224,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
                        ring = LLVMBuildInsertElement(ctx->ac.builder,
                                                      ring, tmp, ctx->ac.i32_0, "");
 
-                       stream_offset += stride * 64;
+                       stream_offset += stride * ctx->ac.wave_size;
 
                        ring = LLVMBuildBitCast(ctx->ac.builder, ring,
                                                ctx->ac.v4i32, "");
@@ -4239,23 +4255,11 @@ ac_setup_rings(struct radv_shader_context *ctx)
 
 unsigned
 radv_nir_get_max_workgroup_size(enum chip_class chip_class,
+                               gl_shader_stage stage,
                                const struct nir_shader *nir)
 {
-       switch (nir->info.stage) {
-       case MESA_SHADER_TESS_CTRL:
-               return chip_class >= GFX7 ? 128 : 64;
-       case MESA_SHADER_GEOMETRY:
-               return chip_class >= GFX9 ? 128 : 64;
-       case MESA_SHADER_COMPUTE:
-               break;
-       default:
-               return 0;
-       }
-
-       unsigned max_workgroup_size = nir->info.cs.local_size[0] *
-               nir->info.cs.local_size[1] *
-               nir->info.cs.local_size[2];
-       return max_workgroup_size;
+       const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1};
+       return radv_get_max_workgroup_size(chip_class, stage, nir ? nir->info.cs.local_size : backup_sizes);
 }
 
 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
@@ -4311,22 +4315,21 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
        ctx.options = options;
        ctx.shader_info = shader_info;
 
-       ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
-       ctx.context = ctx.ac.context;
-       ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);
-
        enum ac_float_mode float_mode =
                options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
                                       AC_FLOAT_MODE_DEFAULT;
 
-       ctx.ac.builder = ac_create_builder(ctx.context, float_mode);
+       ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
+                            options->family, float_mode, options->wave_size,
+                            options->wave_size);
+       ctx.context = ctx.ac.context;
 
        radv_nir_shader_info_init(&shader_info->info);
 
        for(int i = 0; i < shader_count; ++i)
                radv_nir_shader_info_pass(shaders[i], options, &shader_info->info);
 
-       for (i = 0; i < RADV_UD_MAX_SETS; i++)
+       for (i = 0; i < MAX_SETS; i++)
                shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
        for (i = 0; i < AC_UD_MAX_UD; i++)
                shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
@@ -4335,7 +4338,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
        for (int i = 0; i < shader_count; ++i) {
                ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
                                              radv_nir_get_max_workgroup_size(ctx.options->chip_class,
-                                                                           shaders[i]));
+                                                                             shaders[i]->info.stage,
+                                                                             shaders[i]));
        }
 
        if (ctx.ac.chip_class >= GFX10) {
@@ -4356,20 +4360,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
        ctx.abi.load_sampler_desc = radv_get_sampler_desc;
        ctx.abi.load_resource = radv_load_resource;
        ctx.abi.clamp_shadow_reference = false;
-       ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800;
-
-       /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8,
-        * we fallback to the old intrinsics for atomic buffer image operations
-        * and thus we need to apply the indexing workaround...
-        */
-       ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
+       ctx.abi.robust_buffer_access = options->robust_buffer_access;
 
        bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) &&  ctx.options->key.vs_common_out.as_ngg;
        if (shader_count >= 2 || is_ngg)
                ac_init_exec_full_mask(&ctx.ac);
 
-       if ((ctx.ac.family == CHIP_VEGA10 ||
-            ctx.ac.family == CHIP_RAVEN) &&
+       if (options->has_ls_vgpr_init_bug &&
            shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
                ac_nir_fixup_ls_hs_input_vgprs(&ctx);
 
@@ -4443,8 +4440,38 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                        declare_esgs_ring(&ctx);
                }
 
-               if (i)
+               bool nested_barrier = false;
+
+               if (i) {
+                       if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
+                           ctx.options->key.vs_common_out.as_ngg) {
+                               gfx10_ngg_gs_emit_prologue(&ctx);
+                               nested_barrier = false;
+                       } else {
+                               nested_barrier = true;
+                       }
+               }
+
+               if (nested_barrier) {
+                       /* Execute a barrier before the second shader in
+                        * a merged shader.
+                        *
+                        * Execute the barrier inside the conditional block,
+                        * so that empty waves can jump directly to s_endpgm,
+                        * which will also signal the barrier.
+                        *
+                        * This is possible in gfx9, because an empty wave
+                        * for the second shader does not participate in
+                        * the epilogue. With NGG, empty waves may still
+                        * be required to export data (e.g. GS output vertices),
+                        * so we cannot let them exit early.
+                        *
+                        * If the shader is TCS and the TCS epilog is present
+                        * and contains a barrier, it will wait there and then
+                        * reach s_endpgm.
+                       */
                        ac_emit_barrier(&ctx.ac, ctx.stage);
+               }
 
                nir_foreach_variable(variable, &shaders[i]->outputs)
                        scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);
@@ -4461,12 +4488,6 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
                LLVMBasicBlockRef merge_block;
                if (shader_count >= 2 || is_ngg) {
-
-                       if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
-                           ctx.options->key.vs_common_out.as_ngg) {
-                               gfx10_ngg_gs_emit_prologue(&ctx);
-                       }
-
                        LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
                        LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
                        merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
@@ -4632,6 +4653,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
                 break;
         case MESA_SHADER_FRAGMENT:
                 shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
+                shader_info->fs.post_depth_coverage = nir->info.fs.post_depth_coverage;
                 break;
         case MESA_SHADER_GEOMETRY:
                 shader_info->gs.vertices_in = nir->info.gs.vertices_in;
@@ -4692,6 +4714,7 @@ radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
                        shader_info->gs.es_type = nir[0]->info.stage;
                }
        }
+       shader_info->info.wave_size = options->wave_size;
 }
 
 static void
@@ -4799,17 +4822,15 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
        ctx.options = options;
        ctx.shader_info = shader_info;
 
-       ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
-       ctx.context = ctx.ac.context;
-       ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);
-
-       ctx.is_gs_copy_shader = true;
-
        enum ac_float_mode float_mode =
                options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
                                       AC_FLOAT_MODE_DEFAULT;
 
-       ctx.ac.builder = ac_create_builder(ctx.context, float_mode);
+       ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
+                            options->family, float_mode, 64, 64);
+       ctx.context = ctx.ac.context;
+
+       ctx.is_gs_copy_shader = true;
        ctx.stage = MESA_SHADER_VERTEX;
 
        radv_nir_shader_info_pass(geom_shader, options, &shader_info->info);