X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Famd%2Fllvm%2Fac_llvm_build.c;h=77d3f7e73fbe84cf49548ad95d247b14a8a04d0b;hp=f789ff5a368f650bee70a45235bb257183e70e71;hb=a79dad950b1f10ddeca2c907025a0f649b470cb9;hpb=77393cf39b7c4ae73c1c1731bddc9a0668740338

diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index f789ff5a368..77d3f7e73fb 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -65,8 +65,6 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
                      enum ac_float_mode float_mode, unsigned wave_size,
                      unsigned ballot_mask_bits)
 {
-   LLVMValueRef args[1];
-
    ctx->context = LLVMContextCreate();
 
    ctx->chip_class = chip_class;
@@ -91,6 +89,9 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
    ctx->f32 = LLVMFloatTypeInContext(ctx->context);
    ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
    ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
+   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
+   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
+   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
    ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
    ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
    ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
@@ -127,11 +128,6 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
    ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
                                                           "invariant.load", 14);
 
-   ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
-
-   args[0] = LLVMConstReal(ctx->f32, 2.5);
-   ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
-
    ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
                                                    "amdgpu.uniform", 14);
 
@@ -464,11 +460,10 @@ ac_build_optimization_barrier(struct ac_llvm_context *ctx,
 }
 
 LLVMValueRef
-ac_build_shader_clock(struct ac_llvm_context *ctx)
+ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
 {
-   const char *intr = LLVM_VERSION_MAJOR >= 9 && ctx->chip_class >= GFX8 ?
-                         "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter";
-   LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0);
+   const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime";
+   LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
 
    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
 }
@@ -707,20 +702,25 @@ ac_build_fdiv(struct ac_llvm_context *ctx,
               LLVMValueRef num,
               LLVMValueRef den)
 {
-   /* If we do (num / den), LLVM >= 7.0 does:
-    *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
-    *
-    * If we do (num * (1 / den)), LLVM does:
-    *    return num * v_rcp_f32(den);
-    */
-   LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
-   LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
-   LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
+   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
+   const char *name;
 
-   /* Use v_rcp_f32 instead of precise division. */
-   if (!LLVMIsConstant(ret))
-      LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
-   return ret;
+   /* For doubles, we need precise division to pass GLCTS. */
+   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL &&
+       type_size == 8)
+      return LLVMBuildFDiv(ctx->builder, num, den, "");
+
+   if (type_size == 2)
+      name = "llvm.amdgcn.rcp.f16";
+   else if (type_size == 4)
+      name = "llvm.amdgcn.rcp.f32";
+   else
+      name = "llvm.amdgcn.rcp.f64";
+
+   LLVMValueRef rcp = ac_build_intrinsic(ctx, name, LLVMTypeOf(den),
+                                         &den, 1, AC_FUNC_ATTR_READNONE);
+
+   return LLVMBuildFMul(ctx->builder, num, rcp, "");
 }
 
 /* See fast_idiv_by_const.h. */
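The fdiv rewrite above drops the fpmath-metadata trick (1/den tagged with 2.5 ULP) in favor of an explicit llvm.amdgcn.rcp call, selecting the overload by operand size. A minimal standalone sketch of that selection, keyed off the LLVM type kind rather than mesa's ac_get_type_size() ("rcp_intr_name" is a hypothetical helper, not mesa API):

#include <llvm-c/Core.h>
#include <stddef.h>

/* Map a scalar float type to the AMDGPU reciprocal intrinsic name
 * used by the new ac_build_fdiv() path. */
static const char *rcp_intr_name(LLVMTypeRef type)
{
   switch (LLVMGetTypeKind(type)) {
   case LLVMHalfTypeKind:   return "llvm.amdgcn.rcp.f16";
   case LLVMFloatTypeKind:  return "llvm.amdgcn.rcp.f32";
   case LLVMDoubleTypeKind: return "llvm.amdgcn.rcp.f64";
   default:                 return NULL; /* no hardware rcp overload */
   }
}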
@@ -1188,8 +1188,6 @@ ac_build_buffer_store_common(struct ac_llvm_context *ctx,
                              LLVMValueRef vindex,
                              LLVMValueRef voffset,
                              LLVMValueRef soffset,
-                             unsigned num_channels,
-                             LLVMTypeRef return_channel_type,
                              unsigned cache_policy,
                              bool use_format,
                              bool structurized)
@@ -1203,12 +1201,10 @@ ac_build_buffer_store_common(struct ac_llvm_context *ctx,
    args[idx++] = voffset ? voffset : ctx->i32_0;
    args[idx++] = soffset ? soffset : ctx->i32_0;
    args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
-   unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) &&
-                   num_channels == 3 ? 4 : num_channels;
    const char *indexing_kind = structurized ? "struct" : "raw";
    char name[256], type_name[8];
 
-   LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
-   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
+   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));
 
    if (use_format) {
       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
@@ -1228,13 +1224,10 @@ ac_build_buffer_store_format(struct ac_llvm_context *ctx,
                              LLVMValueRef data,
                              LLVMValueRef vindex,
                              LLVMValueRef voffset,
-                             unsigned num_channels,
                              unsigned cache_policy)
 {
-   ac_build_buffer_store_common(ctx, rsrc, data, vindex,
-                                voffset, NULL, num_channels,
-                                ctx->f32, cache_policy,
-                                true, true);
+   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL,
+                                cache_policy, true, true);
 }
 
 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
@@ -1283,7 +1276,6 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
       ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata),
                                    ctx->i32_0, voffset, offset,
-                                   num_channels, ctx->f32,
                                    cache_policy, false, false);
       return;
    }
@@ -1327,6 +1319,11 @@ ac_build_buffer_load_common(struct ac_llvm_context *ctx,
    const char *indexing_kind = structurized ? "struct" : "raw";
    char name[256], type_name[8];
 
+   /* D16 is only supported on gfx8+ */
+   assert(!use_format ||
+          (channel_type != ctx->f16 && channel_type != ctx->i16) ||
+          ctx->chip_class >= GFX8);
+
    LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
    ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
 
@@ -1402,10 +1399,12 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                          LLVMValueRef voffset,
                                          unsigned num_channels,
                                          unsigned cache_policy,
-                                         bool can_speculate)
+                                         bool can_speculate,
+                                         bool d16)
 {
    return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
-                                      ctx->i32_0, num_channels, ctx->f32,
+                                      ctx->i32_0, num_channels,
+                                      d16 ? ctx->f16 : ctx->f32,
                                       cache_policy, can_speculate,
                                       true, true);
 }
@@ -1657,7 +1656,7 @@ ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
    }
 
    int log_recombine = 0;
-   if (ctx->chip_class == GFX6 && !known_aligned) {
+   if ((ctx->chip_class == GFX6 || ctx->chip_class == GFX10) && !known_aligned) {
       /* Avoid alignment restrictions by loading one byte at a time. */
      load_num_channels <<= load_log_size;
      log_recombine = load_log_size;
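With num_channels and return_channel_type gone from ac_build_buffer_store_common(), the overload suffix now comes straight from LLVMTypeOf(data). A rough, simplified stand-in for what ac_build_type_name_for_intr() produces ("type_overload_suffix" is illustrative only and covers just the element types used in these hunks):

#include <llvm-c/Core.h>
#include <stdio.h>

/* Derive an intrinsic overload suffix such as "f32", "i16" or "v4f16"
 * from an LLVM value type. The real mesa helper handles more cases. */
static void type_overload_suffix(LLVMTypeRef type, char *buf, size_t size)
{
   unsigned num = 1;

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      num = LLVMGetVectorSize(type);
      type = LLVMGetElementType(type);
   }

   const char *elem =
      LLVMGetTypeKind(type) == LLVMHalfTypeKind ? "f16" :
      LLVMGetTypeKind(type) == LLVMFloatTypeKind ? "f32" :
      LLVMGetTypeKind(type) == LLVMIntegerTypeKind &&
      LLVMGetIntTypeWidth(type) == 16 ? "i16" : "i32";

   if (num > 1)
      snprintf(buf, size, "v%u%s", num, elem);
   else
      snprintf(buf, size, "%s", elem);
}

For instance, a <4 x half> data value would yield "v4f16", so a format store of it ends up named llvm.amdgcn.raw.buffer.store.format.v4f16.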
@@ -1941,8 +1940,7 @@ ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
    if (LLVM_VERSION_MAJOR >= 9) {
       /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
       ac_build_buffer_store_common(ctx, rsrc, vdata, NULL,
-                                   voffset, soffset, 1,
-                                   ctx->i16, cache_policy,
+                                   voffset, soffset, cache_policy,
                                    false, false);
    } else {
       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
@@ -1968,8 +1966,7 @@ ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
    if (LLVM_VERSION_MAJOR >= 9) {
       /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
       ac_build_buffer_store_common(ctx, rsrc, vdata, NULL,
-                                   voffset, soffset, 1,
-                                   ctx->i8, cache_policy,
+                                   voffset, soffset, cache_policy,
                                    false, false);
    } else {
       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
@@ -2064,6 +2061,8 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
 
    if (result_type == ctx->f16)
       val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
+   else if (result_type == ctx->v2f16)
+      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
 
    for (unsigned i = 0; i < 4; ++i) {
       tl_lanes[i] = i & mask;
@@ -2197,8 +2196,10 @@ ac_build_umsb(struct ac_llvm_context *ctx,
 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
                            LLVMValueRef b)
 {
-   char name[64];
-   snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
+   char name[64], type[64];
+
+   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
+   snprintf(name, sizeof(name), "llvm.minnum.%s", type);
    LLVMValueRef args[2] = {a, b};
    return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
                              AC_FUNC_ATTR_READNONE);
@@ -2207,8 +2208,10 @@ LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
                            LLVMValueRef b)
 {
-   char name[64];
-   snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
+   char name[64], type[64];
+
+   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
+   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
    LLVMValueRef args[2] = {a, b};
    return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
                              AC_FUNC_ATTR_READNONE);
@@ -2257,13 +2260,10 @@ void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
 
    if (a->compr) {
-      LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
-      LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
-
       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
-                                 v2i16, "");
+                                 ctx->v2i16, "");
       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
-                                 v2i16, "");
+                                 ctx->v2i16, "");
       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
 
@@ -2380,6 +2380,14 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
           (a->lod ? 1 : 0) +
           (a->level_zero ? 1 : 0) +
           (a->derivs[0] ? 1 : 0) <= 1);
+   assert((a->min_lod ? 1 : 0) +
+          (a->lod ? 1 : 0) +
+          (a->level_zero ? 1 : 0) <= 1);
+   assert(!a->d16 || (ctx->chip_class >= GFX8 &&
+                      a->opcode != ac_image_atomic &&
+                      a->opcode != ac_image_atomic_cmpswap &&
+                      a->opcode != ac_image_get_lod &&
+                      a->opcode != ac_image_get_resinfo));
 
    if (a->opcode == ac_image_get_lod) {
       switch (dim) {
@@ -2435,6 +2443,9 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
          args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
       if (a->lod)
         args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+      if (a->min_lod)
+         args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
+
       overload[num_overloads++] = sample ? ".f32" : ".i32";
 
       args[num_args++] = a->resource;
@@ -2488,7 +2499,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
    char intr_name[96];
    snprintf(intr_name, sizeof(intr_name),
             "llvm.amdgcn.image.%s%s" /* base name */
-            "%s%s%s" /* sample/gather modifiers */
+            "%s%s%s%s" /* sample/gather modifiers */
             ".%s.%s%s%s%s", /* dimension and type overloads */
             name, atomic_subop, a->compare ? ".c" : "",
@@ -2496,9 +2507,10 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
             lod_suffix ? ".l" :
             a->derivs[0] ? ".d" :
             a->level_zero ? ".lz" : "",
+            a->min_lod ? ".cl" : "",
             a->offset ? ".o" : "",
             dimname,
-            atomic ? "i32" : "v4f32",
+            atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"),
             overload[0], overload[1], overload[2]);
 
    LLVMTypeRef retty;
@@ -2507,15 +2519,14 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
    else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
       retty = ctx->voidt;
    else
-      retty = ctx->v4f32;
+      retty = a->d16 ? ctx->v4f16 : ctx->v4f32;
 
    LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args,
                                             num_args, a->attributes);
-   if (!sample && retty == ctx->v4f32) {
-      result = LLVMBuildBitCast(ctx->builder, result,
-                                ctx->v4i32, "");
-   }
+   if (!sample && !atomic && retty != ctx->voidt)
+      result = ac_to_integer(ctx, result);
+
    return result;
 }
@@ -2541,10 +2552,7 @@ LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx,
 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
                                     LLVMValueRef args[2])
 {
-   LLVMTypeRef v2f16 =
-      LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
-
-   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16,
                              args, 2, AC_FUNC_ATTR_READNONE);
 }
 
@@ -2719,33 +2727,6 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
                       ctx->voidt, args, 1, 0);
 }
 
-LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
-                            LLVMValueRef src1, LLVMValueRef src2,
-                            unsigned bitsize)
-{
-   LLVMTypeRef type;
-   char *intr;
-
-   if (bitsize == 16) {
-      intr = "llvm.amdgcn.fmed3.f16";
-      type = ctx->f16;
-   } else if (bitsize == 32) {
-      intr = "llvm.amdgcn.fmed3.f32";
-      type = ctx->f32;
-   } else {
-      intr = "llvm.amdgcn.fmed3.f64";
-      type = ctx->f64;
-   }
-
-   LLVMValueRef params[] = {
-      src0,
-      src1,
-      src2,
-   };
-   return ac_build_intrinsic(ctx, intr, type, params, 3,
-                             AC_FUNC_ATTR_READNONE);
-}
-
 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
                             unsigned bitsize)
 {
@@ -3067,6 +3048,7 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
                             LLVMValueRef main_fn,
                             uint8_t *vs_output_param_offset,
                             uint32_t num_outputs,
+                            uint32_t skip_output_mask,
                             uint8_t *num_param_exports)
 {
    LLVMBasicBlockRef bb;
@@ -3133,12 +3115,13 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
       }
 
       /* Eliminate constant and duplicated PARAM exports. */
-      if (ac_eliminate_const_output(vs_output_param_offset,
-                                    num_outputs, &exp) ||
-          ac_eliminate_duplicated_output(ctx,
-                                         vs_output_param_offset,
-                                         num_outputs, &exports,
-                                         &exp)) {
+      if (!((1u << target) & skip_output_mask) &&
+          (ac_eliminate_const_output(vs_output_param_offset,
+                                     num_outputs, &exp) ||
+           ac_eliminate_duplicated_output(ctx,
+                                          vs_output_param_offset,
+                                          num_outputs, &exports,
+                                          &exp))) {
          removed_any = true;
       } else {
          exports.exp[exports.num++] = exp;
@@ -3590,12 +3573,14 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
 }
 
 static LLVMValueRef
-_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
+                   LLVMValueRef lane, bool with_opt_barrier)
 {
    LLVMTypeRef type = LLVMTypeOf(src);
    LLVMValueRef result;
 
-   ac_build_optimization_barrier(ctx, &src);
+   if (with_opt_barrier)
+      ac_build_optimization_barrier(ctx, &src);
 
    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
    if (lane)
@@ -3611,15 +3596,10 @@ _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef l
    return LLVMBuildTrunc(ctx->builder, result, type, "");
 }
 
-/**
- * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
- * @param ctx
- * @param src
- * @param lane - id of the lane or NULL for the first active lane
- * @return value of the lane
- */
-LLVMValueRef
-ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+static LLVMValueRef
+ac_build_readlane_common(struct ac_llvm_context *ctx,
+                         LLVMValueRef src, LLVMValueRef lane,
+                         bool with_opt_barrier)
 {
    LLVMTypeRef src_type = LLVMTypeOf(src);
    src = ac_to_integer(ctx, src);
@@ -3633,14 +3613,19 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la
          LLVMBuildBitCast(ctx->builder, src, vec_type, "");
       ret = LLVMGetUndef(vec_type);
       for (unsigned i = 0; i < bits / 32; i++) {
+         LLVMValueRef ret_comp;
+
          src = LLVMBuildExtractElement(ctx->builder, src_vector,
                                        LLVMConstInt(ctx->i32, i, 0), "");
-         LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
+
+         ret_comp = _ac_build_readlane(ctx, src, lane,
+                                       with_opt_barrier);
+
          ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
                                       LLVMConstInt(ctx->i32, i, 0), "");
       }
    } else {
-      ret = _ac_build_readlane(ctx, src, lane);
+      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
    }
 
    if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
@@ -3648,6 +3633,30 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la
    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
 }
 
+/**
+ * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
+ *
+ * The optimization barrier is not needed if the value is the same in all lanes
+ * or if this is called in the outermost block.
+ *
+ * @param ctx
+ * @param src
+ * @param lane - id of the lane or NULL for the first active lane
+ * @return value of the lane
+ */
+LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx,
+                                              LLVMValueRef src, LLVMValueRef lane)
+{
+   return ac_build_readlane_common(ctx, src, lane, false);
+}
+
+
+LLVMValueRef
+ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+{
+   return ac_build_readlane_common(ctx, src, lane, true);
+}
+
 LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                 LLVMValueRef value, LLVMValueRef lane)
 {
@@ -3664,9 +3673,7 @@ ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
                                 (LLVMValueRef []) { mask, ctx->i32_0 },
                                 2, AC_FUNC_ATTR_READNONE);
    }
-   LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
-                                            LLVMVectorType(ctx->i32, 2),
-                                            "");
+   LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
    LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
                                                   ctx->i32_0, "");
    LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
@@ -4663,6 +4670,24 @@ ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
    return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
 }
 
+LLVMValueRef
+ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
+{
+   if (!ctx->postponed_kill)
+      return ac_build_load_helper_invocation(ctx);
+
+   /* !(exact && postponed) */
+   LLVMValueRef exact = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
+                                           ctx->i1, NULL, 0,
+                                           AC_FUNC_ATTR_READNONE);
+
+   LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
+   LLVMValueRef result = LLVMBuildAnd(ctx->builder, exact, postponed, "");
+
+   return LLVMBuildSelect(ctx->builder, result, ctx->i32_0,
+                          LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), "");
+}
+
 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
                            LLVMValueRef *args, unsigned num_args)
 {
@@ -4750,6 +4775,18 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav
 {
    LLVMBuilderRef builder = ctx->builder;
    LLVMValueRef tmp;
+   bool export_dummy_prim = false;
+
+   /* HW workaround for a GPU hang with 100% culling.
+    * We always have to export at least 1 primitive.
+    * Export a degenerate triangle using vertex 0 for all 3 vertices.
+    */
+   if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
+      assert(vtx_cnt == ctx->i32_0);
+      prim_cnt = ctx->i32_1;
+      vtx_cnt = ctx->i32_1;
+      export_dummy_prim = true;
+   }
 
    ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
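For reference, the select at the end of ac_build_is_helper_invocation() above can be reproduced with plain LLVM-C. A minimal sketch, assuming the two i1 inputs are already available ("build_helper_mask" is a hypothetical name, not mesa API):

#include <llvm-c/Core.h>

/* helper = !(live && postponed): yields i32 0 for a real, still-live
 * invocation and ~0 (true) for a helper invocation. */
static LLVMValueRef build_helper_mask(LLVMBuilderRef builder,
                                      LLVMContextRef context,
                                      LLVMValueRef live,      /* i1 */
                                      LLVMValueRef postponed) /* i1 */
{
   LLVMTypeRef i32 = LLVMInt32TypeInContext(context);
   LLVMValueRef real = LLVMBuildAnd(builder, live, postponed, "");

   return LLVMBuildSelect(builder, real,
                          LLVMConstInt(i32, 0, false),
                          LLVMConstInt(i32, 0xFFFFFFFF, false), "");
}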
@@ -4757,6 +4794,24 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav
    tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
    ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);
 
+   if (export_dummy_prim) {
+      struct ac_ngg_prim prim = {};
+      /* The vertex indices are 0,0,0. */
+      prim.passthrough = ctx->i32_0;
+
+      struct ac_export_args pos = {};
+      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0;
+      pos.target = V_008DFC_SQ_EXP_POS;
+      pos.enabled_channels = 0xf;
+      pos.done = true;
+
+      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx),
+                                       ctx->i32_0, ""), 5021);
+      ac_build_export_prim(ctx, &prim);
+      ac_build_export(ctx, &pos);
+      ac_build_endif(ctx, 5021);
+   }
+
    ac_build_endif(ctx, 5020);
 }
 
@@ -4885,10 +4940,20 @@ ac_build_main(const struct ac_shader_args *args,
       if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
          ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
          ac_add_attr_dereferenceable(P, UINT64_MAX);
+         ac_add_attr_alignment(P, 32);
       }
    }
 
    ctx->main_function = main_function;
+
+   if (LLVM_VERSION_MAJOR >= 11) {
+      /* Enable denormals for FP16 and FP64: */
+      LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math",
+                                         "ieee,ieee");
+      /* Disable denormals for FP32: */
+      LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
+                                         "preserve-sign,preserve-sign");
+   }
+
    return main_function;
 }
 
@@ -4957,3 +5022,38 @@ LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx,
    return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], "");
 #endif
 }
+
+/**
+ * Convert triangle strip indices to triangle indices. This is used to decompose
+ * triangle strips into triangles.
+ */
+void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx,
+                                                 LLVMValueRef is_odd,
+                                                 LLVMValueRef flatshade_first,
+                                                 LLVMValueRef index[3])
+{
+   LLVMBuilderRef builder = ctx->builder;
+   LLVMValueRef out[3];
+
+   /* We need to change the vertex order for odd triangles to get correct
+    * front/back facing by swapping 2 vertex indices, but we also have to
+    * keep the provoking vertex in the same place.
+    *
+    * If the first vertex is provoking, swap index 1 and 2.
+    * If the last vertex is provoking, swap index 0 and 1.
+    */
+   out[0] = LLVMBuildSelect(builder, flatshade_first,
+                            index[0],
+                            LLVMBuildSelect(builder, is_odd,
+                                            index[1], index[0], ""), "");
+   out[1] = LLVMBuildSelect(builder, flatshade_first,
+                            LLVMBuildSelect(builder, is_odd,
+                                            index[2], index[1], ""),
+                            LLVMBuildSelect(builder, is_odd,
+                                            index[0], index[1], ""), "");
+   out[2] = LLVMBuildSelect(builder, flatshade_first,
+                            LLVMBuildSelect(builder, is_odd,
+                                            index[1], index[2], ""),
+                            index[2], "");
+   memcpy(index, out, sizeof(out));
+}
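The nested selects in ac_build_triangle_strip_indices_to_triangle() are easier to audit against a plain C reference. A CPU-side sketch of the same rules (illustrative only, not part of mesa):

#include <stdbool.h>
#include <string.h>

/* Odd triangles swap two indices to fix the winding order while the
 * provoking vertex (first or last) keeps its position. */
static void strip_to_tri_indices(bool is_odd, bool flatshade_first,
                                 unsigned index[3])
{
   unsigned out[3];

   if (flatshade_first) {
      /* Provoking vertex is index 0; odd triangles swap 1 and 2. */
      out[0] = index[0];
      out[1] = is_odd ? index[2] : index[1];
      out[2] = is_odd ? index[1] : index[2];
   } else {
      /* Provoking vertex is index 2; odd triangles swap 0 and 1. */
      out[0] = is_odd ? index[1] : index[0];
      out[1] = is_odd ? index[0] : index[1];
      out[2] = index[2];
   }
   memcpy(index, out, sizeof(out));
}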