aco: Fix integer overflows when emitting parallel copies during RA
[mesa.git] / src / amd / vulkan / radv_nir_to_llvm.c
index 81b610ab137648ad7976034c13167daf94f670df..db21ad809b784e424da9b5839300c2979ce65210 100644 (file)
 #include "radv_shader.h"
 #include "radv_shader_helper.h"
 #include "radv_shader_args.h"
+#include "radv_debug.h"
 #include "nir/nir.h"
 
-#include <llvm-c/Core.h>
-#include <llvm-c/TargetMachine.h>
-#include <llvm-c/Transforms/Scalar.h>
-#include <llvm-c/Transforms/Utils.h>
-
 #include "sid.h"
 #include "ac_binary.h"
 #include "ac_llvm_util.h"
@@ -593,11 +589,12 @@ store_tcs_output(struct ac_shader_abi *abi,
                 LLVMValueRef param_index,
                 unsigned const_index,
                 LLVMValueRef src,
-                unsigned writemask)
+                unsigned writemask,
+                unsigned component,
+                unsigned driver_location)
 {
        struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
        const unsigned location = var->data.location;
-       unsigned component = var->data.location_frac;
        const bool is_patch = var->data.patch;
        const bool is_compact = var->data.compact;
        LLVMValueRef dw_addr;
@@ -880,39 +877,21 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
 
 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
                                     unsigned stream,
+                                    LLVMValueRef vertexidx,
                                     LLVMValueRef *addrs);
 
 static void
-visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
+visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream,
+                              LLVMValueRef vertexidx, LLVMValueRef *addrs)
 {
-       LLVMValueRef gs_next_vertex;
-       LLVMValueRef can_emit;
        unsigned offset = 0;
        struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
 
        if (ctx->args->options->key.vs_common_out.as_ngg) {
-               gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+               gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
                return;
        }
 
-       /* Write vertex attribute values to GSVS ring */
-       gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
-                                      ctx->gs_next_vertex[stream],
-                                      "");
-
-       /* If this thread has already emitted the declared maximum number of
-        * vertices, don't emit any more: excessive vertex emissions are not
-        * supposed to have any effect.
-        */
-       can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
-                                LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-
-       bool use_kill = !ctx->args->shader_info->gs.writes_memory;
-       if (use_kill)
-               ac_build_kill_if_false(&ctx->ac, can_emit);
-       else
-               ac_build_ifcc(&ctx->ac, can_emit, 6505);
-
        for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
                unsigned output_usage_mask =
                        ctx->args->shader_info->gs.output_usage_mask[i];
@@ -937,7 +916,7 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
 
                        offset++;
 
-                       voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+                       voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
                        voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
 
                        out_val = ac_to_integer(&ctx->ac, out_val);
@@ -953,16 +932,9 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
                }
        }
 
-       gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex,
-                                     ctx->ac.i32_1, "");
-       LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
-
        ac_build_sendmsg(&ctx->ac,
                         AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                         ctx->gs_wave_id);
-
-       if (!use_kill)
-               ac_build_endif(&ctx->ac, 6505);
 }
 
 static void
@@ -1300,7 +1272,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
                 * access are detected. Only GFX6 and GFX10 are affected.
                 */
                bool unaligned_vertex_fetches = false;
-               if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class == GFX10) &&
+               if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) &&
                    vtx_info->chan_format != data_format &&
                    ((attrib_offset % vtx_info->element_size) ||
                     (attrib_stride % vtx_info->element_size)))
@@ -1311,7 +1283,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
                        LLVMValueRef values[4];
 
                        assert(ctx->ac.chip_class == GFX6 ||
-                              ctx->ac.chip_class == GFX10);
+                              ctx->ac.chip_class >= GFX10);
 
                        for (unsigned chan  = 0; chan < num_channels; chan++) {
                                unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
@@ -1396,7 +1368,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 static void
 handle_vs_inputs(struct radv_shader_context *ctx,
                  struct nir_shader *nir) {
-       nir_foreach_variable(variable, &nir->inputs)
+       nir_foreach_shader_in_variable(variable, nir)
                handle_vs_input_decl(ctx, variable);
 }
 
@@ -1406,7 +1378,7 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
 {
        bool uses_center = false;
        bool uses_centroid = false;
-       nir_foreach_variable(variable, &nir->inputs) {
+       nir_foreach_shader_in_variable(variable, nir) {
                if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
                    variable->data.sample)
                        continue;
@@ -1582,6 +1554,30 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
                        break;
                }
 
+               /* Replace NaN by zero (only 32-bit) to fix game bugs if
+                * requested.
+                */
+               if (ctx->args->options->enable_mrt_output_nan_fixup &&
+                   !is_16bit &&
+                   (col_format == V_028714_SPI_SHADER_32_R ||
+                    col_format == V_028714_SPI_SHADER_32_GR ||
+                    col_format == V_028714_SPI_SHADER_32_AR ||
+                    col_format == V_028714_SPI_SHADER_32_ABGR ||
+                    col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+                       for (unsigned i = 0; i < 4; i++) {
+                               LLVMValueRef args[2] = {
+                                       values[i],
+                                       LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)
+                               };
+                               LLVMValueRef isnan =
+                                       ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
+                                                          args, 2, AC_FUNC_ATTR_READNONE);
+                               values[i] = LLVMBuildSelect(ctx->ac.builder, isnan,
+                                                           ctx->ac.f32_0,
+                                                           values[i], "");
+                       }
+               }
+
                /* Pack f16 or norm_i16/u16. */
                if (packf) {
                        for (chan = 0; chan < 2; chan++) {
@@ -1801,6 +1797,7 @@ radv_build_param_exports(struct radv_shader_context *ctx,
 
                if (slot_name != VARYING_SLOT_LAYER &&
                    slot_name != VARYING_SLOT_PRIMITIVE_ID &&
+                   slot_name != VARYING_SLOT_VIEWPORT &&
                    slot_name != VARYING_SLOT_CLIP_DIST0 &&
                    slot_name != VARYING_SLOT_CLIP_DIST1 &&
                    slot_name < VARYING_SLOT_VAR0)
@@ -1920,12 +1917,10 @@ radv_llvm_export_vs(struct radv_shader_context *ctx,
                        outinfo->pos_exports++;
        }
 
-       /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+       /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
         * Setting valid_mask=1 prevents it and has no other effect.
         */
-       if (ctx->ac.family == CHIP_NAVI10 ||
-           ctx->ac.family == CHIP_NAVI12 ||
-           ctx->ac.family == CHIP_NAVI14)
+       if (ctx->ac.chip_class == GFX10)
                pos_args[0].valid_mask = 1;
 
        pos_idx = 0;
@@ -2322,8 +2317,7 @@ static void build_streamout_vertex(struct radv_shader_context *ctx,
                for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
                        unsigned output_usage_mask =
                                ctx->args->shader_info->gs.output_usage_mask[i];
-                       uint8_t output_stream =
-                               output_stream = ctx->args->shader_info->gs.output_streams[i];
+                       uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
 
                        if (!(ctx->output_mask & (1ull << i)) ||
                            output_stream != stream)
@@ -3313,25 +3307,11 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
 
 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
                                     unsigned stream,
+                                    LLVMValueRef vertexidx,
                                     LLVMValueRef *addrs)
 {
        LLVMBuilderRef builder = ctx->ac.builder;
        LLVMValueRef tmp;
-       const LLVMValueRef vertexidx =
-               LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
-
-       /* If this thread has already emitted the declared maximum number of
-        * vertices, skip the write: excessive vertex emissions are not
-        * supposed to have any effect.
-        */
-       const LLVMValueRef can_emit =
-               LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
-                             LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-       ac_build_ifcc(&ctx->ac, can_emit, 9001);
-
-       tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
-       tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
-       LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
 
        const LLVMValueRef vertexptr =
                ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
@@ -3363,6 +3343,13 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
        }
        assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);
 
+       /* Store the current number of emitted vertices to zero out remaining
+        * primitive flags in case the geometry shader doesn't emit the maximum
+        * number of vertices.
+        */
+       tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
+       LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
+
        /* Determine and store whether this vertex completed a primitive. */
        const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
 
@@ -3399,8 +3386,6 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
        tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
        tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
        LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
-
-       ac_build_endif(&ctx->ac, 9001);
 }
 
 static void
@@ -3742,7 +3727,7 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
        ac_optimize_vs_outputs(&ctx->ac,
                               ctx->main_function,
                               outinfo->vs_output_param_offset,
-                              VARYING_SLOT_MAX,
+                              VARYING_SLOT_MAX, 0,
                               &outinfo->param_exports);
 }
 
@@ -3751,7 +3736,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
 {
        if (ctx->args->options->chip_class <= GFX8 &&
            (ctx->stage == MESA_SHADER_GEOMETRY ||
-            ctx->args->options->key.vs_common_out.as_es || ctx->args->options->key.vs_common_out.as_es)) {
+            ctx->args->options->key.vs_common_out.as_es)) {
                unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
                                                                   : RING_ESGS_VS;
                LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
@@ -3929,7 +3914,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
        ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class,
                             args->options->family, float_mode,
-                            args->shader_info->wave_size, 64);
+                            args->shader_info->wave_size,
+                            args->shader_info->ballot_bit_size);
        ctx.context = ctx.ac.context;
 
        ctx.max_workgroup_size = 0;
@@ -3951,7 +3937,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
        ctx.abi.inputs = &ctx.inputs[0];
        ctx.abi.emit_outputs = handle_shader_outputs_post;
-       ctx.abi.emit_vertex = visit_emit_vertex;
+       ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
        ctx.abi.load_ubo = radv_load_ubo;
        ctx.abi.load_ssbo = radv_load_ssbo;
        ctx.abi.load_sampler_desc = radv_get_sampler_desc;
@@ -4042,13 +4028,15 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                                ctx.tcs_num_inputs = args->options->key.tcs.num_inputs;
                        else
                                ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written);
+                       unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written);
+                       unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written);
                        ctx.tcs_num_patches =
                                get_tcs_num_patches(
                                        ctx.args->options->key.tcs.input_vertices,
                                        ctx.shader->info.tess.tcs_vertices_out,
                                        ctx.tcs_num_inputs,
-                                       ctx.args->shader_info->tcs.outputs_written,
-                                       ctx.args->shader_info->tcs.patch_outputs_written,
+                                       tcs_num_outputs,
+                                       tcs_num_patch_outputs,
                                        ctx.args->options->tess_offchip_block_dw_size,
                                        ctx.args->options->chip_class,
                                        ctx.args->options->family);
@@ -4103,7 +4091,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                        ac_emit_barrier(&ctx.ac, ctx.stage);
                }
 
-               nir_foreach_variable(variable, &shaders[i]->outputs)
+               nir_foreach_shader_out_variable(variable, shaders[i])
                        scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);
 
                ac_setup_rings(&ctx);
@@ -4152,15 +4140,18 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                }
 
                if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
+                       unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written);
+                       unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written);
                        args->shader_info->tcs.num_patches = ctx.tcs_num_patches;
-                       args->shader_info->tcs.lds_size =
+                       args->shader_info->tcs.num_lds_blocks =
                                calculate_tess_lds_size(
+                                       ctx.args->options->chip_class,
                                        ctx.args->options->key.tcs.input_vertices,
                                        ctx.shader->info.tess.tcs_vertices_out,
                                        ctx.tcs_num_inputs,
                                        ctx.tcs_num_patches,
-                                       ctx.args->shader_info->tcs.outputs_written,
-                                       ctx.args->shader_info->tcs.patch_outputs_written);
+                                       tcs_num_outputs,
+                                       tcs_num_patch_outputs);
                }
        }
 
@@ -4271,7 +4262,7 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
        free(elf_buffer);
 }
 
-void
+static void
 radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
                        struct radv_shader_binary **rbinary,
                        const struct radv_shader_args *args,
@@ -4397,7 +4388,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
        LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
 }
 
-void
+static void
 radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
                            struct nir_shader *geom_shader,
                            struct radv_shader_binary **rbinary,
@@ -4419,7 +4410,7 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
 
        ac_setup_rings(&ctx);
 
-       nir_foreach_variable(variable, &geom_shader->outputs) {
+       nir_foreach_shader_out_variable(variable, geom_shader) {
                scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
                ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader,
                                             variable, MESA_SHADER_VERTEX);
@@ -4436,3 +4427,34 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
        (*rbinary)->is_gs_copy_shader = true;
        
 }
+
+void
+llvm_compile_shader(struct radv_device *device,
+                   unsigned shader_count,
+                   struct nir_shader *const *shaders,
+                   struct radv_shader_binary **binary,
+                   struct radv_shader_args *args)
+{
+       enum ac_target_machine_options tm_options = 0;
+       struct ac_llvm_compiler ac_llvm;
+       bool thread_compiler;
+
+       tm_options |= AC_TM_SUPPORTS_SPILL;
+       if (args->options->check_ir)
+               tm_options |= AC_TM_CHECK_IR;
+
+       thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);
+
+       radv_init_llvm_compiler(&ac_llvm, thread_compiler,
+                               args->options->family, tm_options,
+                               args->shader_info->wave_size);
+
+       if (args->is_gs_copy_shader) {
+               radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);
+       } else {
+               radv_compile_nir_shader(&ac_llvm, binary, args,
+                                       shaders, shader_count);
+       }
+
+       radv_destroy_llvm_compiler(&ac_llvm, thread_compiler);
+}