radv: hardcode the number of waves for the GFX6 LS-HS bug
[mesa.git] / src / amd / vulkan / radv_nir_to_llvm.c
index ee4acf7b5a787d5cbe18b676d02ac8197dc0dded..5b4ccc60e7237ea19d5b0d9089d27c3fad55684f 100644 (file)
@@ -190,7 +190,7 @@ get_tcs_num_patches(struct radv_shader_context *ctx)
 
        /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
        if (ctx->options->chip_class == GFX6) {
-               unsigned one_wave = ctx->options->wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+               unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
                num_patches = MIN2(num_patches, one_wave);
        }
        return num_patches;
@@ -1601,6 +1601,18 @@ load_tes_input(struct ac_shader_abi *abi,
        return result;
 }
 
+/* Combine a and b (via ac_to_integer + gather) and bitcast the pair to type. */
+static LLVMValueRef
+radv_emit_fetch_64bit(struct radv_shader_context *ctx,
+                     LLVMTypeRef type, LLVMValueRef a, LLVMValueRef b)
+{
+       LLVMValueRef halves[2];
+       halves[0] = ac_to_integer(&ctx->ac, a);
+       halves[1] = ac_to_integer(&ctx->ac, b);
+       LLVMValueRef packed = ac_build_gather_values(&ctx->ac, halves, 2);
+       return LLVMBuildBitCast(ctx->ac.builder, packed, type, "");
+}
+
 static LLVMValueRef
 load_gs_input(struct ac_shader_abi *abi,
              unsigned location,
@@ -1629,6 +1641,14 @@ load_gs_input(struct ac_shader_abi *abi,
                        dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
                                               LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
                        value[i] = ac_lds_load(&ctx->ac, dw_addr);
+
+                       if (ac_get_type_size(type) == 8) {
+                               dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
+                                                      LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index + 1, 0), "");
+                               LLVMValueRef tmp = ac_lds_load(&ctx->ac, dw_addr);
+
+                               value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
+                       }
                } else {
                        LLVMValueRef soffset =
                                LLVMConstInt(ctx->ac.i32,
@@ -1640,6 +1660,21 @@ load_gs_input(struct ac_shader_abi *abi,
                                                        ctx->ac.i32_0,
                                                        vtx_offset, soffset,
                                                        0, ac_glc, true, false);
+
+                       if (ac_get_type_size(type) == 8) {
+                               soffset = LLVMConstInt(ctx->ac.i32,
+                                                      (param * 4 + i + const_index + 1) * 256,
+                                                      false);
+
+                               LLVMValueRef tmp =
+                                       ac_build_buffer_load(&ctx->ac,
+                                                            ctx->esgs_ring, 1,
+                                                            ctx->ac.i32_0,
+                                                            vtx_offset, soffset,
+                                                            0, ac_glc, true, false);
+
+                               value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
+                       }
                }
 
                if (ac_get_type_size(type) == 2) {
@@ -1761,13 +1796,17 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
                                       "");
 
        /* If this thread has already emitted the declared maximum number of
-        * vertices, kill it: excessive vertex emissions are not supposed to
-        * have any effect, and GS threads have no externally observable
-        * effects other than emitting vertices.
+        * vertices, don't emit any more: excessive vertex emissions are not
+        * supposed to have any effect.
         */
        can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                                 LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-       ac_build_kill_if_false(&ctx->ac, can_emit);
+
+       bool use_kill = !ctx->shader_info->gs.writes_memory;
+       if (use_kill)
+               ac_build_kill_if_false(&ctx->ac, can_emit);
+       else
+               ac_build_ifcc(&ctx->ac, can_emit, 6505);
 
        for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
                unsigned output_usage_mask =
@@ -1814,6 +1853,9 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
        ac_build_sendmsg(&ctx->ac,
                         AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                         ctx->gs_wave_id);
+
+       if (!use_kill)
+               ac_build_endif(&ctx->ac, 6505);
 }
 
 static void
@@ -3133,6 +3175,17 @@ static void build_export_prim(struct radv_shader_context *ctx,
        ac_build_export(&ctx->ac, &args);
 }
 
+/* Find the first entry of so->outputs whose location equals the given
+ * location; returns a pointer to it, or NULL when none matches. */
+static struct radv_stream_output *
+radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
+{
+       unsigned idx = 0;
+       while (idx < so->num_outputs && so->outputs[idx].location != location)
+               idx++;
+       return idx < so->num_outputs ? &so->outputs[idx] : NULL;
+}
+
 static void build_streamout_vertex(struct radv_shader_context *ctx,
                                   LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
                                   unsigned stream, LLVMValueRef offset_vtx,
@@ -3153,23 +3206,79 @@ static void build_streamout_vertex(struct radv_shader_context *ctx,
                offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
        }
 
-       for (unsigned i = 0; i < so->num_outputs; ++i) {
-               struct radv_stream_output *output =
-                       &ctx->shader_info->so.outputs[i];
+       if (ctx->stage == MESA_SHADER_GEOMETRY) {
+               struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
+               unsigned noutput = 0;
+               unsigned out_idx = 0;
 
-               if (stream != output->stream)
-                       continue;
+               for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+                       unsigned output_usage_mask =
+                               ctx->shader_info->gs.output_usage_mask[i];
+                       uint8_t output_stream =
+                               output_stream = ctx->shader_info->gs.output_streams[i];
 
-               unsigned loc = output->location;
-               struct radv_shader_output_values out = {};
+                       if (!(ctx->output_mask & (1ull << i)) ||
+                           output_stream != stream)
+                               continue;
 
-               for (unsigned comp = 0; comp < 4; comp++) {
-                       tmp = ac_build_gep0(&ctx->ac, vertexptr,
-                                           LLVMConstInt(ctx->ac.i32, 4 * loc + comp, false));
-                       out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+                       outputs[noutput].slot_name = i;
+                       outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
+                       outputs[noutput].usage_mask = output_usage_mask;
+
+                       int length = util_last_bit(output_usage_mask);
+
+                       for (unsigned j = 0; j < length; j++, out_idx++) {
+                               if (!(output_usage_mask & (1 << j)))
+                                       continue;
+
+                               tmp = ac_build_gep0(&ctx->ac, vertexptr,
+                                                   LLVMConstInt(ctx->ac.i32, out_idx, false));
+                               outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");
+                       }
+
+                       for (unsigned j = length; j < 4; j++)
+                               outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
+
+                       noutput++;
+               }
+
+               for (unsigned i = 0; i < noutput; i++) {
+                       struct radv_stream_output *output =
+                               radv_get_stream_output_by_loc(so, outputs[i].slot_name);
+
+                       if (!output ||
+                           output->stream != stream)
+                               continue;
+
+                       struct radv_shader_output_values out = {};
+
+                       for (unsigned j = 0; j < 4; j++) {
+                               out.values[j] = outputs[i].values[j];
+                       }
+
+                       radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
                }
+       } else {
+               for (unsigned i = 0; i < so->num_outputs; ++i) {
+                       struct radv_stream_output *output =
+                               &ctx->shader_info->so.outputs[i];
+
+                       if (stream != output->stream)
+                               continue;
+
+                       struct radv_shader_output_values out = {};
+
+                       for (unsigned comp = 0; comp < 4; comp++) {
+                               if (!(output->component_mask & (1 << comp)))
+                                       continue;
+
+                               tmp = ac_build_gep0(&ctx->ac, vertexptr,
+                                                   LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
+                               out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+                       }
 
-               radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+                       radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+               }
        }
 }
 
@@ -3534,6 +3643,7 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx,
 static void
 handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
 {
+       struct radv_streamout_info *so = &ctx->shader_info->so;
        LLVMBuilderRef builder = ctx->ac.builder;
        LLVMValueRef vertex_ptr = NULL;
        LLVMValueRef tmp, tmp2;
@@ -3546,15 +3656,20 @@ handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
 
        vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
 
-       for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
-               if (!(ctx->output_mask & (1ull << i)))
-                       continue;
+       for (unsigned i = 0; i < so->num_outputs; ++i) {
+               struct radv_stream_output *output =
+                       &ctx->shader_info->so.outputs[i];
+
+               unsigned loc = output->location;
+
+               for (unsigned comp = 0; comp < 4; comp++) {
+                       if (!(output->component_mask & (1 << comp)))
+                               continue;
 
-               for (unsigned j = 0; j < 4; j++) {
                        tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
-                                           LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
+                                           LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
                        tmp2 = LLVMBuildLoad(builder,
-                                            ctx->abi.outputs[4 * i + j], "");
+                                            ctx->abi.outputs[4 * loc + comp], "");
                        tmp2 = ac_to_integer(&ctx->ac, tmp2);
                        LLVMBuildStore(builder, tmp2, tmp);
                }
@@ -3587,7 +3702,11 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
        LLVMValueRef num_vertices_val;
 
        if (ctx->stage == MESA_SHADER_VERTEX) {
-               num_vertices_val = LLVMConstInt(ctx->ac.i32, 1, false);
+               LLVMValueRef outprim_val =
+                       LLVMConstInt(ctx->ac.i32,
+                                    ctx->options->key.vs.outprim, false);
+               num_vertices_val = LLVMBuildAdd(builder, outprim_val,
+                                               ctx->ac.i32_1, "");
                num_vertices = 3; /* TODO: optimize for points & lines */
        } else {
                assert(ctx->stage == MESA_SHADER_TESS_EVAL);
@@ -4083,7 +4202,7 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
        const LLVMValueRef can_emit =
                LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
                              LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-       ac_build_kill_if_false(&ctx->ac, can_emit);
+       ac_build_ifcc(&ctx->ac, can_emit, 9001);
 
        tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
        tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
@@ -4149,6 +4268,8 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
        tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
        tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
        LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
+       ac_build_endif(&ctx->ac, 9001);
 }
 
 static void
@@ -4648,9 +4769,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
        ctx.options = options;
        ctx.shader_info = shader_info;
 
-       enum ac_float_mode float_mode =
-               options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
-                                      AC_FLOAT_MODE_DEFAULT;
+       enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
+
+       if (shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
+               float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
+       } else if (options->unsafe_math) {
+               float_mode = AC_FLOAT_MODE_UNSAFE_FP_MATH;
+       }
 
        ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
                             options->family, float_mode, options->wave_size, 64);
@@ -4697,11 +4822,9 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
            shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
                ac_nir_fixup_ls_hs_input_vgprs(&ctx);
 
-       if (shaders[shader_count - 1]->info.stage != MESA_SHADER_GEOMETRY &&
-           (ctx.options->key.vs_common_out.as_ngg &&
-            !ctx.options->key.vs_common_out.as_es)) {
-               /* Unconditionally declare scratch space base for streamout and
-                * vertex compaction. Whether space is actually allocated is
+       if (is_ngg) {
+               /* Declare scratch space base for streamout and vertex
+                * compaction. Whether space is actually allocated is
                 * determined during linking / PM4 creation.
                 *
                 * Add an extra dword per vertex to ensure an odd stride, which
@@ -4942,7 +5065,7 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
                fprintf(stderr, "\n");
        }
 
-       if (options->record_llvm_ir) {
+       if (options->record_ir) {
                char *llvm_ir = LLVMPrintModuleToString(llvm_module);
                llvm_ir_string = strdup(llvm_ir);
                LLVMDisposeMessage(llvm_ir);
@@ -5035,7 +5158,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
                LLVMBasicBlockRef bb;
                unsigned offset;
 
-               if (!num_components)
+               if (stream > 0 && !num_components)
                        continue;
 
                if (stream > 0 && !ctx->shader_info->so.num_outputs)