X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_nir_to_llvm.c;h=db21ad809b784e424da9b5839300c2979ce65210;hb=2182bbf84f0f19846a47f0438ec702f4d862731e;hp=d833bc2477dc5b945d5f28f8742898782cfe241b;hpb=2d295ab3f35acd796826d6f06f798d8618b1d814;p=mesa.git

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index d833bc2477d..db21ad809b7 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -589,11 +589,12 @@ store_tcs_output(struct ac_shader_abi *abi,
 		 LLVMValueRef param_index,
 		 unsigned const_index,
 		 LLVMValueRef src,
-		 unsigned writemask)
+		 unsigned writemask,
+		 unsigned component,
+		 unsigned driver_location)
 {
 	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
 	const unsigned location = var->data.location;
-	unsigned component = var->data.location_frac;
 	const bool is_patch = var->data.patch;
 	const bool is_compact = var->data.compact;
 	LLVMValueRef dw_addr;
@@ -876,39 +877,21 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
 
 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
 				     unsigned stream,
+				     LLVMValueRef vertexidx,
 				     LLVMValueRef *addrs);
 
 static void
-visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
+visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream,
+			       LLVMValueRef vertexidx, LLVMValueRef *addrs)
 {
-	LLVMValueRef gs_next_vertex;
-	LLVMValueRef can_emit;
 	unsigned offset = 0;
 	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
 
 	if (ctx->args->options->key.vs_common_out.as_ngg) {
-		gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+		gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
 		return;
 	}
 
-	/* Write vertex attribute values to GSVS ring */
-	gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
-				       ctx->gs_next_vertex[stream],
-				       "");
-
-	/* If this thread has already emitted the declared maximum number of
-	 * vertices, don't emit any more: excessive vertex emissions are not
-	 * supposed to have any effect.
-	 */
-	can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
-				 LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-
-	bool use_kill = !ctx->args->shader_info->gs.writes_memory;
-	if (use_kill)
-		ac_build_kill_if_false(&ctx->ac, can_emit);
-	else
-		ac_build_ifcc(&ctx->ac, can_emit, 6505);
-
 	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
 		unsigned output_usage_mask =
 			ctx->args->shader_info->gs.output_usage_mask[i];
@@ -933,7 +916,7 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
 
 			offset++;
 
-			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
 			voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
 
 			out_val = ac_to_integer(&ctx->ac, out_val);
@@ -949,16 +932,9 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr
 		}
 	}
 
-	gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex,
-				      ctx->ac.i32_1, "");
-	LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
-
 	ac_build_sendmsg(&ctx->ac,
 			 AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
 			 ctx->gs_wave_id);
-
-	if (!use_kill)
-		ac_build_endif(&ctx->ac, 6505);
 }
 
 static void
@@ -1296,7 +1272,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 		 * access are detected. Only GFX6 and GFX10 are affected.
 		 */
 		bool unaligned_vertex_fetches = false;
-		if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class == GFX10) &&
+		if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) &&
 		    vtx_info->chan_format != data_format &&
 		    ((attrib_offset % vtx_info->element_size) ||
 		     (attrib_stride % vtx_info->element_size)))
@@ -1307,7 +1283,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 			LLVMValueRef values[4];
 
 			assert(ctx->ac.chip_class == GFX6 ||
-			       ctx->ac.chip_class == GFX10);
+			       ctx->ac.chip_class >= GFX10);
 
 			for (unsigned chan  = 0; chan < num_channels; chan++) {
 				unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
@@ -1392,7 +1368,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 static void
 handle_vs_inputs(struct radv_shader_context *ctx,
                  struct nir_shader *nir) {
-	nir_foreach_variable(variable, &nir->inputs)
+	nir_foreach_shader_in_variable(variable, nir)
 		handle_vs_input_decl(ctx, variable);
 }
 
@@ -1402,7 +1378,7 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
 {
 	bool uses_center = false;
 	bool uses_centroid = false;
-	nir_foreach_variable(variable, &nir->inputs) {
+	nir_foreach_shader_in_variable(variable, nir) {
 		if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
 		    variable->data.sample)
 			continue;
@@ -1578,6 +1554,30 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
 			break;
 		}
 
+		/* Replace NaN by zero (only 32-bit) to fix game bugs if
+		 * requested.
+		 */
+		if (ctx->args->options->enable_mrt_output_nan_fixup &&
+		    !is_16bit &&
+		    (col_format == V_028714_SPI_SHADER_32_R ||
+		     col_format == V_028714_SPI_SHADER_32_GR ||
+		     col_format == V_028714_SPI_SHADER_32_AR ||
+		     col_format == V_028714_SPI_SHADER_32_ABGR ||
+		     col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+			for (unsigned i = 0; i < 4; i++) {
+				LLVMValueRef args[2] = {
+					values[i],
+					LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)
+				};
+				LLVMValueRef isnan =
+					ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
+		                                           args, 2, AC_FUNC_ATTR_READNONE);
+				values[i] = LLVMBuildSelect(ctx->ac.builder, isnan,
+							    ctx->ac.f32_0,
+							    values[i], "");
+			}
+		}
+
 		/* Pack f16 or norm_i16/u16. */
 		if (packf) {
 			for (chan = 0; chan < 2; chan++) {
@@ -1797,6 +1797,7 @@ radv_build_param_exports(struct radv_shader_context *ctx,
 
 		if (slot_name != VARYING_SLOT_LAYER &&
 		    slot_name != VARYING_SLOT_PRIMITIVE_ID &&
+		    slot_name != VARYING_SLOT_VIEWPORT &&
 		    slot_name != VARYING_SLOT_CLIP_DIST0 &&
 		    slot_name != VARYING_SLOT_CLIP_DIST1 &&
 		    slot_name < VARYING_SLOT_VAR0)
@@ -1916,12 +1917,10 @@ radv_llvm_export_vs(struct radv_shader_context *ctx,
 			outinfo->pos_exports++;
 	}
 
-	/* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+	/* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
 	 * Setting valid_mask=1 prevents it and has no other effect.
 	 */
-	if (ctx->ac.family == CHIP_NAVI10 ||
-	    ctx->ac.family == CHIP_NAVI12 ||
-	    ctx->ac.family == CHIP_NAVI14)
+	if (ctx->ac.chip_class == GFX10)
 		pos_args[0].valid_mask = 1;
 
 	pos_idx = 0;
@@ -2318,8 +2317,7 @@ static void build_streamout_vertex(struct radv_shader_context *ctx,
 		for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
 			unsigned output_usage_mask =
 				ctx->args->shader_info->gs.output_usage_mask[i];
-			uint8_t output_stream =
-				output_stream = ctx->args->shader_info->gs.output_streams[i];
+			uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
 
 			if (!(ctx->output_mask & (1ull << i)) ||
 			    output_stream != stream)
@@ -3309,25 +3307,11 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
 
 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
 				     unsigned stream,
+				     LLVMValueRef vertexidx,
 				     LLVMValueRef *addrs)
 {
 	LLVMBuilderRef builder = ctx->ac.builder;
 	LLVMValueRef tmp;
-	const LLVMValueRef vertexidx =
-		LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
-
-	/* If this thread has already emitted the declared maximum number of
-	 * vertices, skip the write: excessive vertex emissions are not
-	 * supposed to have any effect.
-	 */
-	const LLVMValueRef can_emit =
-		LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
-			      LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
-	ac_build_ifcc(&ctx->ac, can_emit, 9001);
-
-	tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
-	tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
-	LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
 
 	const LLVMValueRef vertexptr =
 		ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
@@ -3359,6 +3343,13 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
 	}
 	assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);
 
+	/* Store the current number of emitted vertices to zero out remaining
+	 * primitive flags in case the geometry shader doesn't emit the maximum
+	 * number of vertices.
+	 */
+	tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
+	LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
+
 	/* Determine and store whether this vertex completed a primitive. */
 	const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
 
@@ -3395,8 +3386,6 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
 	tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
 	tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
 	LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
-
-	ac_build_endif(&ctx->ac, 9001);
 }
 
 static void
@@ -3738,7 +3727,7 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
 	ac_optimize_vs_outputs(&ctx->ac,
 			       ctx->main_function,
 			       outinfo->vs_output_param_offset,
-			       VARYING_SLOT_MAX,
+			       VARYING_SLOT_MAX, 0,
 			       &outinfo->param_exports);
 }
 
@@ -3747,7 +3736,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
 {
 	if (ctx->args->options->chip_class <= GFX8 &&
 	    (ctx->stage == MESA_SHADER_GEOMETRY ||
-	     ctx->args->options->key.vs_common_out.as_es || ctx->args->options->key.vs_common_out.as_es)) {
+	     ctx->args->options->key.vs_common_out.as_es)) {
 		unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
 								   : RING_ESGS_VS;
 		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
@@ -3925,7 +3914,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
 	ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class,
 			     args->options->family, float_mode,
-			     args->shader_info->wave_size, 64);
+			     args->shader_info->wave_size,
+			     args->shader_info->ballot_bit_size);
 	ctx.context = ctx.ac.context;
 
 	ctx.max_workgroup_size = 0;
@@ -3947,7 +3937,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 
 	ctx.abi.inputs = &ctx.inputs[0];
 	ctx.abi.emit_outputs = handle_shader_outputs_post;
-	ctx.abi.emit_vertex = visit_emit_vertex;
+	ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
 	ctx.abi.load_ubo = radv_load_ubo;
 	ctx.abi.load_ssbo = radv_load_ssbo;
 	ctx.abi.load_sampler_desc = radv_get_sampler_desc;
@@ -4038,13 +4028,15 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 				ctx.tcs_num_inputs = args->options->key.tcs.num_inputs;
 			else
 				ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written);
+			unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written);
+			unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written);
 			ctx.tcs_num_patches =
 				get_tcs_num_patches(
 					ctx.args->options->key.tcs.input_vertices,
 					ctx.shader->info.tess.tcs_vertices_out,
 					ctx.tcs_num_inputs,
-					ctx.args->shader_info->tcs.outputs_written,
-					ctx.args->shader_info->tcs.patch_outputs_written,
+					tcs_num_outputs,
+					tcs_num_patch_outputs,
 					ctx.args->options->tess_offchip_block_dw_size,
 					ctx.args->options->chip_class,
 					ctx.args->options->family);
@@ -4099,7 +4091,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 			ac_emit_barrier(&ctx.ac, ctx.stage);
 		}
 
-		nir_foreach_variable(variable, &shaders[i]->outputs)
+		nir_foreach_shader_out_variable(variable, shaders[i])
 			scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);
 
 		ac_setup_rings(&ctx);
@@ -4148,15 +4140,18 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
 		}
 
 		if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
+			unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written);
+			unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written);
 			args->shader_info->tcs.num_patches = ctx.tcs_num_patches;
-			args->shader_info->tcs.lds_size =
+			args->shader_info->tcs.num_lds_blocks =
 				calculate_tess_lds_size(
+					ctx.args->options->chip_class,
 					ctx.args->options->key.tcs.input_vertices,
 					ctx.shader->info.tess.tcs_vertices_out,
 					ctx.tcs_num_inputs,
 					ctx.tcs_num_patches,
-					ctx.args->shader_info->tcs.outputs_written,
-					ctx.args->shader_info->tcs.patch_outputs_written);
+					tcs_num_outputs,
+					tcs_num_patch_outputs);
 		}
 	}
 
@@ -4415,7 +4410,7 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
 
 	ac_setup_rings(&ctx);
 
-	nir_foreach_variable(variable, &geom_shader->outputs) {
+	nir_foreach_shader_out_variable(variable, geom_shader) {
 		scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
 		ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader,
 					     variable, MESA_SHADER_VERTEX);
@@ -4447,8 +4442,6 @@ llvm_compile_shader(struct radv_device *device,
 	tm_options |= AC_TM_SUPPORTS_SPILL;
 	if (args->options->check_ir)
 		tm_options |= AC_TM_CHECK_IR;
-	if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT)
-		tm_options |= AC_TM_NO_LOAD_STORE_OPT;
 
 	thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);