radv: hardcode the number of waves for the GFX6 LS-HS bug

[mesa.git] / src / amd / vulkan / radv_nir_to_llvm.c
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c

index ee4acf7b5a787d5cbe18b676d02ac8197dc0dded..5b4ccc60e7237ea19d5b0d9089d27c3fad55684f 100644 (file)
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -190,7 +190,7 @@ get_tcs_num_patches(struct radv_shader_context *ctx)
  
         /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
         if (ctx->options->chip_class == GFX6) {
-               unsigned one_wave = ctx->options->wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+               unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
                 num_patches = MIN2(num_patches, one_wave);
         }
         return num_patches;
@@ -1601,6 +1601,18 @@ load_tes_input(struct ac_shader_abi *abi,
         return result;
  }
  
+static LLVMValueRef /* returns a and b gathered into one value and bitcast to 'type' */
+radv_emit_fetch_64bit(struct radv_shader_context *ctx,
+                     LLVMTypeRef type, LLVMValueRef a, LLVMValueRef b)
+{
+       LLVMValueRef values[2] = {
+               ac_to_integer(&ctx->ac, a), /* element 0: load_gs_input passes the value it loaded first */
+               ac_to_integer(&ctx->ac, b), /* element 1: load_gs_input passes the value loaded right after */
+       };
+       LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); /* combine both elements */
+       return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); /* callers use this when ac_get_type_size(type) == 8 */
+}
+
  static LLVMValueRef
  load_gs_input(struct ac_shader_abi *abi,
               unsigned location,
@@ -1629,6 +1641,14 @@ load_gs_input(struct ac_shader_abi *abi,
                         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
                                                LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
                         value[i] = ac_lds_load(&ctx->ac, dw_addr);
+
+                       if (ac_get_type_size(type) == 8) {
+                               dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
+                                                      LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index + 1, 0), "");
+                               LLVMValueRef tmp = ac_lds_load(&ctx->ac, dw_addr);
+
+                               value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
+                       }
                 } else {
                         LLVMValueRef soffset =
                                 LLVMConstInt(ctx->ac.i32,
@@ -1640,6 +1660,21 @@ load_gs_input(struct ac_shader_abi *abi,
                                                         ctx->ac.i32_0,
                                                         vtx_offset, soffset,
                                                         0, ac_glc, true, false);
+
+                       if (ac_get_type_size(type) == 8) {
+                               soffset = LLVMConstInt(ctx->ac.i32,
+                                                      (param * 4 + i + const_index + 1) * 256,
+                                                      false);
+
+                               LLVMValueRef tmp =
+                                       ac_build_buffer_load(&ctx->ac,
+                                                            ctx->esgs_ring, 1,
+                                                            ctx->ac.i32_0,
+                                                            vtx_offset, soffset,
+                                                            0, ac_glc, true, false);
+
+                               value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp);
+                       }
                 }
  
                 if (ac_get_type_size(type) == 2) {
@@ -1761,13 +1796,17 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
                                        "");
  
         /* If this thread has already emitted the declared maximum number of
-        * vertices, kill it: excessive vertex emissions are not supposed to
-        * have any effect, and GS threads have no externally observable
-        * effects other than emitting vertices.
+        * vertices, don't emit any more: excessive vertex emissions are not
+        * supposed to have any effect.
          */
         can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                                  LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-       ac_build_kill_if_false(&ctx->ac, can_emit);
+
+       bool use_kill = !ctx->shader_info->gs.writes_memory;
+       if (use_kill)
+               ac_build_kill_if_false(&ctx->ac, can_emit);
+       else
+               ac_build_ifcc(&ctx->ac, can_emit, 6505);
  
         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
                 unsigned output_usage_mask =
@@ -1814,6 +1853,9 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
         ac_build_sendmsg(&ctx->ac,
                          AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                          ctx->gs_wave_id);
+
+       if (!use_kill)
+               ac_build_endif(&ctx->ac, 6505);
  }
  
  static void
@@ -3133,6 +3175,17 @@ static void build_export_prim(struct radv_shader_context *ctx,
         ac_build_export(&ctx->ac, &args);
  }
  
+static struct radv_stream_output * /* first so->outputs entry whose location matches, or NULL */
+radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
+{
+       for (unsigned i = 0; i < so->num_outputs; ++i) { /* linear scan over the so->num_outputs entries */
+               if (so->outputs[i].location == location)
+                       return &so->outputs[i]; /* pointer into so->outputs, not a copy */
+       }
+
+       return NULL; /* no entry has this location; build_streamout_vertex skips such outputs */
+}
+
  static void build_streamout_vertex(struct radv_shader_context *ctx,
                                    LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
                                    unsigned stream, LLVMValueRef offset_vtx,
@@ -3153,23 +3206,79 @@ static void build_streamout_vertex(struct radv_shader_context *ctx,
                 offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
         }
  
-       for (unsigned i = 0; i < so->num_outputs; ++i) {
-               struct radv_stream_output *output =
-                       &ctx->shader_info->so.outputs[i];
+       if (ctx->stage == MESA_SHADER_GEOMETRY) {
+               struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
+               unsigned noutput = 0;
+               unsigned out_idx = 0;
  
-               if (stream != output->stream)
-                       continue;
+               for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+                       unsigned output_usage_mask =
+                               ctx->shader_info->gs.output_usage_mask[i];
+                       uint8_t output_stream = /* stream recorded for GS output slot i */
+                               ctx->shader_info->gs.output_streams[i];
  
-               unsigned loc = output->location;
-               struct radv_shader_output_values out = {};
+                       if (!(ctx->output_mask & (1ull << i)) ||
+                           output_stream != stream)
+                               continue;
  
-               for (unsigned comp = 0; comp < 4; comp++) {
-                       tmp = ac_build_gep0(&ctx->ac, vertexptr,
-                                           LLVMConstInt(ctx->ac.i32, 4 * loc + comp, false));
-                       out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+                       outputs[noutput].slot_name = i;
+                       outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
+                       outputs[noutput].usage_mask = output_usage_mask;
+
+                       int length = util_last_bit(output_usage_mask);
+
+                       for (unsigned j = 0; j < length; j++, out_idx++) {
+                               if (!(output_usage_mask & (1 << j)))
+                                       continue;
+
+                               tmp = ac_build_gep0(&ctx->ac, vertexptr,
+                                                   LLVMConstInt(ctx->ac.i32, out_idx, false));
+                               outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");
+                       }
+
+                       for (unsigned j = length; j < 4; j++)
+                               outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
+
+                       noutput++;
+               }
+
+               for (unsigned i = 0; i < noutput; i++) {
+                       struct radv_stream_output *output =
+                               radv_get_stream_output_by_loc(so, outputs[i].slot_name);
+
+                       if (!output ||
+                           output->stream != stream)
+                               continue;
+
+                       struct radv_shader_output_values out = {};
+
+                       for (unsigned j = 0; j < 4; j++) {
+                               out.values[j] = outputs[i].values[j];
+                       }
+
+                       radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
                 }
+       } else {
+               for (unsigned i = 0; i < so->num_outputs; ++i) {
+                       struct radv_stream_output *output =
+                               &ctx->shader_info->so.outputs[i];
+
+                       if (stream != output->stream)
+                               continue;
+
+                       struct radv_shader_output_values out = {};
+
+                       for (unsigned comp = 0; comp < 4; comp++) {
+                               if (!(output->component_mask & (1 << comp)))
+                                       continue;
+
+                               tmp = ac_build_gep0(&ctx->ac, vertexptr,
+                                                   LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
+                               out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+                       }
  
-               radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+                       radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+               }
         }
  }
  
@@ -3534,6 +3643,7 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx,
  static void
  handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
  {
+       struct radv_streamout_info *so = &ctx->shader_info->so;
         LLVMBuilderRef builder = ctx->ac.builder;
         LLVMValueRef vertex_ptr = NULL;
         LLVMValueRef tmp, tmp2;
@@ -3546,15 +3656,20 @@ handle_ngg_outputs_post_1(struct radv_shader_context *ctx)
  
         vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
  
-       for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
-               if (!(ctx->output_mask & (1ull << i)))
-                       continue;
+       for (unsigned i = 0; i < so->num_outputs; ++i) {
+               struct radv_stream_output *output =
+                       &ctx->shader_info->so.outputs[i];
+
+               unsigned loc = output->location;
+
+               for (unsigned comp = 0; comp < 4; comp++) {
+                       if (!(output->component_mask & (1 << comp)))
+                               continue;
  
-               for (unsigned j = 0; j < 4; j++) {
                         tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
-                                           LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
+                                           LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
                         tmp2 = LLVMBuildLoad(builder,
-                                            ctx->abi.outputs[4 * i + j], "");
+                                            ctx->abi.outputs[4 * loc + comp], "");
                         tmp2 = ac_to_integer(&ctx->ac, tmp2);
                         LLVMBuildStore(builder, tmp2, tmp);
                 }
@@ -3587,7 +3702,11 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
         LLVMValueRef num_vertices_val;
  
         if (ctx->stage == MESA_SHADER_VERTEX) {
-               num_vertices_val = LLVMConstInt(ctx->ac.i32, 1, false);
+               LLVMValueRef outprim_val =
+                       LLVMConstInt(ctx->ac.i32,
+                                    ctx->options->key.vs.outprim, false);
+               num_vertices_val = LLVMBuildAdd(builder, outprim_val,
+                                               ctx->ac.i32_1, "");
                 num_vertices = 3; /* TODO: optimize for points & lines */
         } else {
                 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
@@ -4083,7 +4202,7 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
         const LLVMValueRef can_emit =
                 LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
                               LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-       ac_build_kill_if_false(&ctx->ac, can_emit);
+       ac_build_ifcc(&ctx->ac, can_emit, 9001);
  
         tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
         tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
@@ -4149,6 +4268,8 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
         tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
         tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
         LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
+       ac_build_endif(&ctx->ac, 9001);
  }
  
  static void
@@ -4648,9 +4769,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
         ctx.options = options;
         ctx.shader_info = shader_info;
  
-       enum ac_float_mode float_mode =
-               options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
-                                      AC_FLOAT_MODE_DEFAULT;
+       enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
+
+       if (shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
+               float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
+       } else if (options->unsafe_math) {
+               float_mode = AC_FLOAT_MODE_UNSAFE_FP_MATH;
+       }
  
         ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
                              options->family, float_mode, options->wave_size, 64);
@@ -4697,11 +4822,9 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
             shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
                 ac_nir_fixup_ls_hs_input_vgprs(&ctx);
  
-       if (shaders[shader_count - 1]->info.stage != MESA_SHADER_GEOMETRY &&
-           (ctx.options->key.vs_common_out.as_ngg &&
-            !ctx.options->key.vs_common_out.as_es)) {
-               /* Unconditionally declare scratch space base for streamout and
-                * vertex compaction. Whether space is actually allocated is
+       if (is_ngg) {
+               /* Declare scratch space base for streamout and vertex
+                * compaction. Whether space is actually allocated is
                  * determined during linking / PM4 creation.
                  *
                  * Add an extra dword per vertex to ensure an odd stride, which
@@ -4942,7 +5065,7 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
                 fprintf(stderr, "\n");
         }
  
-       if (options->record_llvm_ir) {
+       if (options->record_ir) {
                 char *llvm_ir = LLVMPrintModuleToString(llvm_module);
                 llvm_ir_string = strdup(llvm_ir);
                 LLVMDisposeMessage(llvm_ir);
@@ -5035,7 +5158,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
                 LLVMBasicBlockRef bb;
                 unsigned offset;
  
-               if (!num_components)
+               if (stream > 0 && !num_components)
                         continue;
  
                 if (stream > 0 && !ctx->shader_info->so.num_outputs)