X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fllvm%2Fac_nir_to_llvm.c;h=4537ec7126774efcd192f24c7448fc64c0a058b7;hb=286795803c94f14bb0a11366dc1b6f8e497cd8df;hp=4ae45c6204d1716e371926ac8ccac648d6e6dc13;hpb=d3737002eed280ac6362a88f29d6c30d66a1e852;p=mesa.git diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 4ae45c6204d..4537ec71267 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -51,6 +51,7 @@ struct ac_nir_context { struct hash_table *defs; struct hash_table *phis; struct hash_table *vars; + struct hash_table *verified_interp; LLVMValueRef main_function; LLVMBasicBlockRef continue_block; @@ -60,10 +61,16 @@ struct ac_nir_context { LLVMValueRef *locals; }; +static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + const nir_instr *instr, + bool image); + static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr, enum ac_descriptor_type desc_type, const nir_instr *instr, + LLVMValueRef index, bool image, bool write); static void @@ -163,6 +170,17 @@ static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, LLVMIntPredicate pred, LLVMValueRef src0, LLVMValueRef src1) { + LLVMTypeRef src0_type = LLVMTypeOf(src0); + LLVMTypeRef src1_type = LLVMTypeOf(src1); + + if (LLVMGetTypeKind(src0_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) { + src1 = LLVMBuildIntToPtr(ctx->builder, src1, src0_type, ""); + } else if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src0_type) != LLVMPointerTypeKind) { + src0 = LLVMBuildIntToPtr(ctx->builder, src0, src1_type, ""); + } + LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); return LLVMBuildSelect(ctx->builder, result, LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), @@ -187,13 +205,13 @@ static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, LLVMTypeRef result_type, LLVMValueRef src0) { - char name[64]; + char name[64], type[64]; LLVMValueRef params[] = { ac_to_float(ctx, src0), }; - ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, - ac_get_elem_bits(ctx, result_type)); + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); assert(length < sizeof(name)); return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE); } @@ -203,14 +221,14 @@ static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, LLVMTypeRef result_type, LLVMValueRef src0, LLVMValueRef src1) { - char name[64]; + char name[64], type[64]; LLVMValueRef params[] = { ac_to_float(ctx, src0), ac_to_float(ctx, src1), }; - ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, - ac_get_elem_bits(ctx, result_type)); + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); assert(length < sizeof(name)); return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE); } @@ -220,15 +238,15 @@ static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, LLVMTypeRef result_type, LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) { - char name[64]; + char name[64], type[64]; LLVMValueRef params[] = { ac_to_float(ctx, src0), ac_to_float(ctx, src1), ac_to_float(ctx, src2), }; - ASSERTED const int length = snprintf(name, sizeof(name), 
"%s.f%d", intrin, - ac_get_elem_bits(ctx, result_type)); + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); assert(length < sizeof(name)); return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE); } @@ -490,6 +508,93 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, return result; } +struct waterfall_context { + LLVMBasicBlockRef phi_bb[2]; + bool use_waterfall; +}; + +/* To deal with divergent descriptors we can create a loop that handles all + * lanes with the same descriptor on a given iteration (henceforth a + * waterfall loop). + * + * These helper create the begin and end of the loop leaving the caller + * to implement the body. + * + * params: + * - ctx is the usal nir context + * - wctx is a temporary struct containing some loop info. Can be left uninitialized. + * - value is the possibly divergent value for which we built the loop + * - divergent is whether value is actually divergent. If false we just pass + * things through. + */ +static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + LLVMValueRef value, bool divergent) +{ + /* If the app claims the value is divergent but it is constant we can + * end up with a dynamic index of NULL. */ + if (!value) + divergent = false; + + wctx->use_waterfall = divergent; + if (!divergent) + return value; + + ac_build_bgnloop(&ctx->ac, 6000); + + LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL); + + LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value, + scalar_value, "uniform_active"); + + wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder); + ac_build_ifcc(&ctx->ac, active, 6001); + + return scalar_value; +} + +static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + LLVMValueRef value) +{ + LLVMValueRef ret = NULL; + LLVMValueRef phi_src[2]; + LLVMValueRef cc_phi_src[2] = { + LLVMConstInt(ctx->ac.i32, 0, false), + LLVMConstInt(ctx->ac.i32, 0xffffffff, false), + }; + + if (!wctx->use_waterfall) + return value; + + wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder); + + ac_build_endif(&ctx->ac, 6001); + + if (value) { + phi_src[0] = LLVMGetUndef(LLVMTypeOf(value)); + phi_src[1] = value; + + ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb); + } + + /* + * By using the optimization barrier on the exit decision, we decouple + * the operations from the break, and hence avoid LLVM hoisting the + * opteration into the break block. + */ + LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb); + ac_build_optimization_barrier(&ctx->ac, &cc); + + LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2"); + ac_build_ifcc(&ctx->ac, active, 6002); + ac_build_break(&ctx->ac); + ac_build_endif(&ctx->ac, 6002); + + ac_build_endloop(&ctx->ac, 6000); + return ret; +} + static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) { LLVMValueRef src[4], result = NULL; @@ -595,8 +700,18 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); break; case nir_op_frcp: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]); + /* For doubles, we need precise division to pass GLCTS. 
 	case nir_op_frcp:
-		src[0] = ac_to_float(&ctx->ac, src[0]);
-		result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]);
+		/* For doubles, we need precise division to pass GLCTS. */
+		if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL &&
+		    ac_get_type_size(def_type) == 8) {
+			result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1,
+					       ac_to_float(&ctx->ac, src[0]), "");
+		} else {
+			result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rcp",
+						      ac_to_float_type(&ctx->ac, def_type), src[0]);
+		}
+		if (ctx->abi->clamp_div_by_zero)
+			result = ac_build_fmin(&ctx->ac, result,
+					       LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
 		break;
 	case nir_op_iand:
 		result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
 		break;
@@ -655,7 +770,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 	case nir_op_feq32:
 		result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]);
 		break;
-	case nir_op_fne32:
+	case nir_op_fneu32:
 		result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
 		break;
 	case nir_op_flt32:
@@ -741,9 +856,11 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 					      ac_to_float_type(&ctx->ac, def_type), src[0]);
 		break;
 	case nir_op_frsq:
-		result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
-					      ac_to_float_type(&ctx->ac, def_type), src[0]);
-		result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result);
+		result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rsq",
+					      ac_to_float_type(&ctx->ac, def_type), src[0]);
+		if (ctx->abi->clamp_div_by_zero)
+			result = ac_build_fmin(&ctx->ac, result,
+					       LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
 		break;
 	case nir_op_frexp_exp:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -785,7 +902,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 	case nir_op_ffma:
 		/* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
 		result = emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd",
-		                              ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
+					      ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
 		break;
 	case nir_op_ldexp:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -846,15 +963,45 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 		result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
 		break;
 	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+	case nir_op_f2fmp:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
-		if (LLVMTypeOf(src[0]) == ctx->ac.f64)
-			src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
-		LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
-		result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-		result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+
+		/* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
+		 * all f32->f16 conversions have to round towards zero, because both scalar
+		 * and vec2 down-conversions have to round equally.
+		 */
+		if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL ||
+		    instr->op == nir_op_f2f16_rtz) {
+			src[0] = ac_to_float(&ctx->ac, src[0]);
+
+			if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+				src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
+
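
v_cvt_pkrtz_f16 converts a pair of f32 values to f16 with round-toward-zero, which is why (per the comment above) the scalar conversion must round the same way as the vec2 fast path that follows. A self-contained C sketch of a round-toward-zero f32->f16 pack, assuming IEEE binary16/binary32 bit layouts (for illustration; this is not the hardware or driver implementation):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Round-toward-zero f32 -> f16: simply truncate the extra 13 mantissa
 * bits. Finite overflow clamps to the largest finite f16 (0x7bff),
 * because rounding toward zero never rounds up to infinity. */
static uint16_t f32_to_f16_rtz(float f)
{
    uint32_t x;
    memcpy(&x, &f, 4);
    uint16_t sign = (x >> 16) & 0x8000;
    int32_t  exp  = (int32_t)((x >> 23) & 0xff) - 127 + 15; /* rebias 8 -> 5 bits */
    uint32_t mant = x & 0x7fffff;

    if (((x >> 23) & 0xff) == 0xff)          /* inf / NaN pass through */
        return sign | 0x7c00 | (mant ? 0x200 : 0);
    if (exp >= 31)                           /* too large: clamp (RTZ) */
        return sign | 0x7bff;
    if (exp <= 0) {                          /* f16 subnormal or zero */
        if (exp < -10)
            return sign;                     /* underflows to +/-0 */
        mant |= 0x800000;                    /* make the leading 1 explicit */
        return sign | (uint16_t)(mant >> (14 - exp));
    }
    return sign | (uint16_t)(exp << 10) | (uint16_t)(mant >> 13);
}

/* Pack two conversions into one dword, like v_cvt_pkrtz_f16_f32. */
static uint32_t cvt_pkrtz(float a, float b)
{
    return f32_to_f16_rtz(a) | ((uint32_t)f32_to_f16_rtz(b) << 16);
}

int main(void)
{
    printf("0x%08x\n", cvt_pkrtz(1.0f, 65504.0f)); /* 0x7bff3c00 */
    return 0;
}
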
+ */ + if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) { + LLVMValueRef args[] = { + ac_llvm_extract_elem(&ctx->ac, src[0], 0), + ac_llvm_extract_elem(&ctx->ac, src[0], 1), + }; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, args); + break; + } + + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef param[2] = { src[0], LLVMGetUndef(ctx->ac.f32) }; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + } else { + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + else + result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + } break; case nir_op_f2f16_rtne: - case nir_op_f2f16: case nir_op_f2f32: case nir_op_f2f64: src[0] = ac_to_float(&ctx->ac, src[0]); @@ -865,6 +1012,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_u2u8: case nir_op_u2u16: + case nir_op_u2ump: case nir_op_u2u32: case nir_op_u2u64: if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) @@ -874,6 +1022,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_i2i8: case nir_op_i2i16: + case nir_op_i2imp: case nir_op_i2i32: case nir_op_i2i64: if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) @@ -1027,57 +1176,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } - case nir_op_fmin3: - result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", - ac_to_float_type(&ctx->ac, def_type), result, src[2]); - break; - case nir_op_umin3: - result = ac_build_umin(&ctx->ac, src[0], src[1]); - result = ac_build_umin(&ctx->ac, result, src[2]); - break; - case nir_op_imin3: - result = ac_build_imin(&ctx->ac, src[0], src[1]); - result = ac_build_imin(&ctx->ac, result, src[2]); - break; - case nir_op_fmax3: - result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", - ac_to_float_type(&ctx->ac, def_type), result, src[2]); - break; - case nir_op_umax3: - result = ac_build_umax(&ctx->ac, src[0], src[1]); - result = ac_build_umax(&ctx->ac, result, src[2]); - break; - case nir_op_imax3: - result = ac_build_imax(&ctx->ac, src[0], src[1]); - result = ac_build_imax(&ctx->ac, result, src[2]); - break; - case nir_op_fmed3: { - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - src[2] = ac_to_float(&ctx->ac, src[2]); - result = ac_build_fmed3(&ctx->ac, src[0], src[1], src[2], - instr->dest.dest.ssa.bit_size); - break; - } - case nir_op_imed3: { - LLVMValueRef tmp1 = ac_build_imin(&ctx->ac, src[0], src[1]); - LLVMValueRef tmp2 = ac_build_imax(&ctx->ac, src[0], src[1]); - tmp2 = ac_build_imin(&ctx->ac, tmp2, src[2]); - result = ac_build_imax(&ctx->ac, tmp1, tmp2); - break; - } - case nir_op_umed3: { - LLVMValueRef tmp1 = ac_build_umin(&ctx->ac, src[0], src[1]); - LLVMValueRef tmp2 = ac_build_umax(&ctx->ac, src[0], src[1]); - tmp2 = ac_build_umin(&ctx->ac, tmp2, src[2]); - result = ac_build_umax(&ctx->ac, tmp1, tmp2); - break; - } - default: fprintf(stderr, "Unknown NIR alu instr: "); nir_print_instr(&instr->instr, stderr); @@ -1336,12 
@@ -1336,12 +1434,14 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
 	if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
 		unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
 
+		assert(instr->dest.is_ssa);
 		return ac_build_buffer_load_format(&ctx->ac,
 						   args->resource,
 						   args->coords[0],
 						   ctx->ac.i32_0,
 						   util_last_bit(mask),
-						   0, true);
+						   0, true,
+						   instr->dest.ssa.bit_size == 16);
 	}
 
 	args->opcode = ac_image_sample;
@@ -1370,11 +1470,17 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
 		break;
 	case nir_texop_tg4:
 		args->opcode = ac_image_gather4;
-		args->level_zero = true;
+		if (!args->lod && !args->bias)
+			args->level_zero = true;
 		break;
 	case nir_texop_lod:
 		args->opcode = ac_image_get_lod;
 		break;
+	case nir_texop_fragment_fetch:
+	case nir_texop_fragment_mask_fetch:
+		args->opcode = ac_image_load;
+		args->level_zero = false;
+		break;
 	default:
 		break;
 	}
@@ -1468,13 +1574,13 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
 	if (instr->dest.ssa.bit_size == 8) {
 		unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1;
-		LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords);
+		LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
 		ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
 		LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
 
 		LLVMValueRef params[3];
 		if (load_dwords > 1) {
-			LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), "");
+			LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
 			params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
 			params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
 		} else {
@@ -1487,11 +1593,11 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
 		res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
 		if (instr->dest.ssa.num_components > 1)
-			res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), "");
+			res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
 		return res;
 	} else if (instr->dest.ssa.bit_size == 16) {
 		unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
-		LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
+		LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
 		ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
 		LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
 		res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
@@ -1575,14 +1681,29 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
 	}
 
 	if (access & ACCESS_STREAM_CACHE_POLICY)
-		cache_policy |= ac_slc;
+		cache_policy |= ac_slc | ac_glc;
 
 	return cache_policy;
 }
 
+static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx,
+					 struct waterfall_context *wctx,
+					 const nir_intrinsic_instr *instr,
+					 nir_src src)
+{
+	return enter_waterfall(ctx, wctx, get_src(ctx, src),
+			       nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
 static void visit_store_ssbo(struct ac_nir_context *ctx,
 			     nir_intrinsic_instr *instr)
 {
+	if (ctx->ac.postponed_kill) {
+		LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+						  ctx->ac.postponed_kill, "");
+		ac_build_ifcc(&ctx->ac, cond, 7000);
+	}
+
 	LLVMValueRef src_data = get_src(ctx,
instr->src[0]); int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; unsigned writemask = nir_intrinsic_write_mask(instr); @@ -1590,8 +1711,10 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, bool writeonly_memory = access & ACCESS_NON_READABLE; unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory); - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[1]), true); + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]); + + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true); LLVMValueRef base_data = src_data; base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); LLVMValueRef base_offset = get_src(ctx, instr->src[2]); @@ -1626,6 +1749,16 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, count = 1; num_bytes = 2; } + + /* Due to alignment issues, split stores of 8-bit/16-bit + * vectors. + */ + if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) { + writemask |= ((1u << (count - 1)) - 1u) << (start + 1); + count = 1; + num_bytes = elem_size_bytes; + } + data = extract_vector_range(&ctx->ac, base_data, start, count); offset = LLVMBuildAdd(ctx->ac.builder, base_offset, @@ -1666,6 +1799,11 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, cache_policy); } } + + exit_waterfall(ctx, &wctx, NULL); + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7000); } static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, @@ -1727,14 +1865,24 @@ static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, } static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) + nir_intrinsic_instr *instr) { + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, + ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7001); + } + LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2])); const char *op; char name[64], type[8]; LLVMValueRef params[6], descriptor; + LLVMValueRef result; int arg_count = 0; + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); + switch (instr->intrinsic) { case nir_intrinsic_ssbo_atomic_add: op = "add"; @@ -1771,58 +1919,66 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, } descriptor = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[0]), + rsrc_base, true); if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && return_type == ctx->ac.i64) { - return emit_ssbo_comp_swap_64(ctx, descriptor, - get_src(ctx, instr->src[1]), - get_src(ctx, instr->src[2]), - get_src(ctx, instr->src[3])); - } - if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); - } - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - params[arg_count++] = descriptor; + result = emit_ssbo_comp_swap_64(ctx, descriptor, + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2]), + get_src(ctx, instr->src[3])); + } else { + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); + } + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + params[arg_count++] = descriptor; - if (LLVM_VERSION_MAJOR >= 9) { - /* XXX: The new raw/struct atomic intrinsics 
are buggy with - * LLVM 8, see r358579. - */ - params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ - params[arg_count++] = ctx->ac.i32_0; /* soffset */ - params[arg_count++] = ctx->ac.i32_0; /* slc */ + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy with + * LLVM 8, see r358579. + */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i32_0; /* soffset */ + params[arg_count++] = ctx->ac.i32_0; /* slc */ + + ac_build_type_name_for_intr(return_type, type, sizeof(type)); + snprintf(name, sizeof(name), + "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); + } else { + params[arg_count++] = ctx->ac.i32_0; /* vindex */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i1false; /* slc */ - ac_build_type_name_for_intr(return_type, type, sizeof(type)); - snprintf(name, sizeof(name), - "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); - } else { - params[arg_count++] = ctx->ac.i32_0; /* vindex */ - params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ - params[arg_count++] = ctx->ac.i1false; /* slc */ + assert(return_type == ctx->ac.i32); + snprintf(name, sizeof(name), + "llvm.amdgcn.buffer.atomic.%s", op); + } - assert(return_type == ctx->ac.i32); - snprintf(name, sizeof(name), - "llvm.amdgcn.buffer.atomic.%s", op); + result = ac_build_intrinsic(&ctx->ac, name, return_type, params, + arg_count, 0); } - return ac_build_intrinsic(&ctx->ac, name, return_type, params, - arg_count, 0); + result = exit_waterfall(ctx, &wctx, result); + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7001); + return result; } static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) + nir_intrinsic_instr *instr) { + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); + int elem_size_bytes = instr->dest.ssa.bit_size / 8; int num_components = instr->num_components; enum gl_access_qualifier access = nir_intrinsic_access(instr); unsigned cache_policy = get_cache_policy(ctx, access, false, false); LLVMValueRef offset = get_src(ctx, instr->src[1]); - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[0]), false); + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false); LLVMValueRef vindex = ctx->ac.i32_0; LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); @@ -1877,14 +2033,26 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, i += num_elems; } - return ac_build_gather_values(&ctx->ac, results, num_components); + LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components); + return exit_waterfall(ctx, &wctx, ret); +} + +static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + const nir_intrinsic_instr *instr) +{ + return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]), + nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); } static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) + nir_intrinsic_instr *instr) { + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr); + LLVMValueRef ret; - LLVMValueRef rsrc = get_src(ctx, instr->src[0]); + LLVMValueRef rsrc = rsrc_base; LLVMValueRef offset = get_src(ctx, instr->src[1]); int num_components = instr->num_components; @@ -1926,8 +2094,10 @@ static LLVMValueRef visit_load_ubo_buffer(struct 
ac_nir_context *ctx, ret = ac_trim_vector(&ctx->ac, ret, num_components); } - return LLVMBuildBitCast(ctx->ac.builder, ret, + ret = LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); + + return exit_waterfall(ctx, &wctx, ret); } static void @@ -2092,6 +2262,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, switch (mode) { case nir_var_shader_in: + /* TODO: remove this after RADV switches to lowered IO */ if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) { return load_tess_varyings(ctx, instr, true); @@ -2147,6 +2318,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, } break; case nir_var_shader_out: + /* TODO: remove this after RADV switches to lowered IO */ if (ctx->stage == MESA_SHADER_TESS_CTRL) { return load_tess_varyings(ctx, instr, false); } @@ -2177,20 +2349,28 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, break; case nir_var_mem_global: { LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); unsigned explicit_stride = glsl_get_explicit_stride(deref->type); unsigned natural_stride = type_scalar_size_bytes(deref->type); unsigned stride = explicit_stride ? explicit_stride : natural_stride; + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8; + bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; - LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); - if (stride != natural_stride) { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(result_type), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + if (stride != natural_stride || split_loads) { + if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind) + result_type = LLVMGetElementType(result_type); + + LLVMTypeRef ptr_type = LLVMPointerType(result_type, + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) { LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0); values[i] = LLVMBuildLoad(ctx->ac.builder, ac_build_gep_ptr(&ctx->ac, address, offset), ""); + + if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) + LLVMSetOrdering(values[i], LLVMAtomicOrderingMonotonic); } return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components); } else { @@ -2198,6 +2378,9 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, LLVMGetPointerAddressSpace(LLVMTypeOf(address))); address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); + + if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) + LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic); return val; } } @@ -2212,6 +2395,12 @@ static void visit_store_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, + ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7002); + } + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref); @@ -2250,7 +2439,7 @@ visit_store_var(struct ac_nir_context *ctx, switch (deref->mode) { case nir_var_shader_out: - + /* TODO: remove this after RADV switches to lowered IO */ if (ctx->stage == MESA_SHADER_TESS_CTRL) { LLVMValueRef vertex_index = NULL; LLVMValueRef indir_index = 
NULL; @@ -2265,8 +2454,10 @@ visit_store_var(struct ac_nir_context *ctx, ctx->abi->store_tcs_outputs(ctx->abi, var, vertex_index, indir_index, - const_index, src, writemask); - return; + const_index, src, writemask, + var->data.location_frac, + var->data.driver_location); + break; } for (unsigned chan = 0; chan < 8; chan++) { @@ -2332,23 +2523,32 @@ visit_store_var(struct ac_nir_context *ctx, unsigned explicit_stride = glsl_get_explicit_stride(deref->type); unsigned natural_stride = type_scalar_size_bytes(deref->type); unsigned stride = explicit_stride ? explicit_stride : natural_stride; + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8; + bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), LLVMGetPointerAddressSpace(LLVMTypeOf(address))); address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && - stride == natural_stride) { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + stride == natural_stride && !split_stores) { + LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); val = LLVMBuildBitCast(ctx->ac.builder, val, LLVMGetElementType(LLVMTypeOf(address)), ""); - LLVMBuildStore(ctx->ac.builder, val, address); + LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, val, address); + + if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) + LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic); } else { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(LLVMTypeOf(val)), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + LLVMTypeRef val_type = LLVMTypeOf(val); + if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind) + val_type = LLVMGetElementType(val_type); + + LLVMTypeRef ptr_type = LLVMPointerType(val_type, + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); for (unsigned chan = 0; chan < 4; chan++) { if (!(writemask & (1 << chan))) @@ -2361,7 +2561,10 @@ visit_store_var(struct ac_nir_context *ctx, chan); src = LLVMBuildBitCast(ctx->ac.builder, src, LLVMGetElementType(LLVMTypeOf(ptr)), ""); - LLVMBuildStore(ctx->ac.builder, src, ptr); + LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, src, ptr); + + if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) + LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic); } } break; @@ -2370,6 +2573,74 @@ visit_store_var(struct ac_nir_context *ctx, abort(); break; } + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7002); +} + +static void +visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, + ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7002); + } + + unsigned base = nir_intrinsic_base(instr); + unsigned writemask = nir_intrinsic_write_mask(instr); + unsigned component = nir_intrinsic_component(instr); + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); + nir_src offset = *nir_get_io_offset_src(instr); + LLVMValueRef indir_index = NULL; + + if (nir_src_is_const(offset)) + assert(nir_src_as_uint(offset) == 0); + else + indir_index = get_src(ctx, offset); + + switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) { + case 32: + 
break; + case 64: + writemask = widen_mask(writemask, 2); + src = LLVMBuildBitCast(ctx->ac.builder, src, + LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), + ""); + break; + default: + unreachable("unhandled store_output bit size"); + return; + } + + writemask <<= component; + + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + LLVMValueRef vertex_index = + vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL; + + ctx->abi->store_tcs_outputs(ctx->abi, NULL, + vertex_index, indir_index, + 0, src, writemask, + component, base * 4); + return; + } + + /* No indirect indexing is allowed after this point. */ + assert(!indir_index); + + for (unsigned chan = 0; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + + LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + LLVMBuildStore(ctx->ac.builder, value, + ctx->abi->outputs[base * 4 + chan]); + } + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7002); } static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) @@ -2419,6 +2690,7 @@ static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr) static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_index, enum ac_descriptor_type desc_type, bool write) { @@ -2426,11 +2698,12 @@ static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx, instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ? nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL; - return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, true, write); + return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write); } static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_desc_index, struct ac_image_args *args, enum glsl_sampler_dim dim, bool is_array) @@ -2468,7 +2741,7 @@ static void get_image_coords(struct ac_nir_context *ctx, fmask_load_address[2], sample_index, get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - AC_DESC_FMASK, &instr->instr, true, false)); + AC_DESC_FMASK, &instr->instr, dynamic_desc_index, true, false)); } if (count == 1 && !gfx9_1d) { if (instr->src[1].ssa->num_components) @@ -2520,9 +2793,10 @@ static void get_image_coords(struct ac_nir_context *ctx, static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_index, bool write, bool atomic) { - LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write); + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write); if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) { LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); @@ -2538,6 +2812,19 @@ static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, return rsrc; } +static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + const nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref_instr = NULL; + + if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) + deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + + LLVMValueRef value = 
get_sampler_desc_index(ctx, deref_instr, &instr->instr, true); + return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); +} + static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, bool bindless) @@ -2545,21 +2832,23 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, LLVMValueRef res; enum glsl_sampler_dim dim; - enum gl_access_qualifier access; + enum gl_access_qualifier access = nir_intrinsic_access(instr); bool is_array; if (bindless) { dim = nir_intrinsic_image_dim(instr); - access = nir_intrinsic_access(instr); is_array = nir_intrinsic_image_array(instr); } else { const nir_deref_instr *image_deref = get_image_deref(instr); const struct glsl_type *type = image_deref->type; const nir_variable *var = nir_deref_instr_get_variable(image_deref); dim = glsl_get_sampler_dim(type); - access = var->data.access; + access |= var->data.access; is_array = glsl_sampler_type_is_array(type); } + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + struct ac_image_args args = {}; args.cache_policy = get_cache_policy(ctx, access, false, false); @@ -2569,15 +2858,17 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, unsigned num_channels = util_last_bit(mask); LLVMValueRef rsrc, vindex; - rsrc = get_image_buffer_descriptor(ctx, instr, false, false); + rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false); vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); + assert(instr->dest.is_ssa); bool can_speculate = access & ACCESS_CAN_REORDER; res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels, args.cache_policy, - can_speculate); + can_speculate, + instr->dest.ssa.bit_size == 16); res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); @@ -2586,48 +2877,58 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; args.opcode = level_zero ? 
ac_image_load : ac_image_load_mip; - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); - get_image_coords(ctx, instr, &args, dim, is_array); + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); if (!level_zero) args.lod = get_src(ctx, instr->src[3]); args.dmask = 15; args.attributes = AC_FUNC_ATTR_READONLY; + assert(instr->dest.is_ssa); + args.d16 = instr->dest.ssa.bit_size == 16; + res = ac_build_image_opcode(&ctx->ac, &args); } - return res; + return exit_waterfall(ctx, &wctx, res); } static void visit_image_store(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr, + const nir_intrinsic_instr *instr, bool bindless) { - + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, + ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7003); + } enum glsl_sampler_dim dim; - enum gl_access_qualifier access; + enum gl_access_qualifier access = nir_intrinsic_access(instr); bool is_array; + if (bindless) { dim = nir_intrinsic_image_dim(instr); - access = nir_intrinsic_access(instr); is_array = nir_intrinsic_image_array(instr); } else { const nir_deref_instr *image_deref = get_image_deref(instr); const struct glsl_type *type = image_deref->type; const nir_variable *var = nir_deref_instr_get_variable(image_deref); dim = glsl_get_sampler_dim(type); - access = var->data.access; + access |= var->data.access; is_array = glsl_sampler_type_is_array(type); } + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + bool writeonly_memory = access & ACCESS_NON_READABLE; struct ac_image_args args = {}; args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory); if (dim == GLSL_SAMPLER_DIM_BUF) { - LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false); + LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false); LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); unsigned src_channels = ac_get_llvm_num_components(src); LLVMValueRef vindex; @@ -2640,29 +2941,38 @@ static void visit_image_store(struct ac_nir_context *ctx, ctx->ac.i32_0, ""); ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, - ctx->ac.i32_0, src_channels, - args.cache_policy); + ctx->ac.i32_0, args.cache_policy); } else { bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; args.opcode = level_zero ? 
ac_image_store : ac_image_store_mip; args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true); - get_image_coords(ctx, instr, &args, dim, is_array); + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); if (!level_zero) args.lod = get_src(ctx, instr->src[4]); args.dmask = 15; + args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16; ac_build_image_opcode(&ctx->ac, &args); } + exit_waterfall(ctx, &wctx, NULL); + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7003); } static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) + const nir_intrinsic_instr *instr, + bool bindless) { + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, + ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7004); + } + LLVMValueRef params[7]; int param_count = 0; @@ -2691,6 +3001,9 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, is_array = glsl_sampler_type_is_array(type); } + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + switch (instr->intrinsic) { case nir_intrinsic_bindless_image_atomic_add: case nir_intrinsic_image_deref_atomic_add: @@ -2746,16 +3059,6 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, case nir_intrinsic_image_deref_atomic_inc_wrap: { atomic_name = "inc"; atomic_subop = ac_atomic_inc_wrap; - /* ATOMIC_INC instruction does: - * value = (value + 1) % (data + 1) - * but we want: - * value = (value + 1) % data - * So replace 'data' by 'data - 1'. - */ - ctx->ssa_defs[instr->src[3].ssa->index] = - LLVMBuildSub(ctx->ac.builder, - ctx->ssa_defs[instr->src[3].ssa->index], - ctx->ac.i32_1, ""); break; } case nir_intrinsic_bindless_image_atomic_dec_wrap: @@ -2771,8 +3074,9 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, params[param_count++] = get_src(ctx, instr->src[4]); params[param_count++] = get_src(ctx, instr->src[3]); + LLVMValueRef result; if (dim == GLSL_SAMPLER_DIM_BUF) { - params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true); + params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true); params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); /* vindex */ params[param_count++] = ctx->ac.i32_0; /* voffset */ @@ -2793,8 +3097,8 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, } assert(length < sizeof(intrinsic_name)); - return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, - params, param_count, 0); + result = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, + params, param_count, 0); } else { struct ac_image_args args = {}; args.opcode = cmpswap ? 
ac_image_atomic_cmpswap : ac_image_atomic; @@ -2802,20 +3106,29 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, args.data[0] = params[0]; if (cmpswap) args.data[1] = params[1]; - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true); - get_image_coords(ctx, instr, &args, dim, is_array); + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); - return ac_build_image_opcode(&ctx->ac, &args); + result = ac_build_image_opcode(&ctx->ac, &args); } + + result = exit_waterfall(ctx, &wctx, result); + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7004); + return result; } static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) + nir_intrinsic_instr *instr) { - LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + + LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc); - return ac_build_image_get_sample_count(&ctx->ac, rsrc); + return exit_waterfall(ctx, &wctx, ret); } static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, @@ -2835,35 +3148,41 @@ static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, is_array = glsl_sampler_type_is_array(type); } - if (dim == GLSL_SAMPLER_DIM_BUF) - return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true); + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); - struct ac_image_args args = { 0 }; + if (dim == GLSL_SAMPLER_DIM_BUF) { + res = get_buffer_size(ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true); + } else { - args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); - args.dmask = 0xf; - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); - args.opcode = ac_image_get_resinfo; - args.lod = ctx->ac.i32_0; - args.attributes = AC_FUNC_ATTR_READNONE; + struct ac_image_args args = { 0 }; - res = ac_build_image_opcode(&ctx->ac, &args); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + args.dmask = 0xf; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + args.opcode = ac_image_get_resinfo; + assert(nir_src_as_uint(instr->src[1]) == 0); + args.lod = ctx->ac.i32_0; + args.attributes = AC_FUNC_ATTR_READNONE; - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + res = ac_build_image_opcode(&ctx->ac, &args); - if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { - LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); - LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); - z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); - res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); - } - if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { - LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); - res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, - ctx->ac.i32_1, ""); + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = 
LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); + } + + if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, + ctx->ac.i32_1, ""); + } } - return res; + return exit_waterfall(ctx, &wctx, res); } static void emit_membar(struct ac_llvm_context *ac, @@ -2917,7 +3236,30 @@ static void emit_discard(struct ac_nir_context *ctx, cond = ctx->ac.i1false; } - ctx->abi->emit_kill(ctx->abi, cond); + ac_build_kill_if_false(&ctx->ac, cond); +} + +static void emit_demote(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef cond; + + if (instr->intrinsic == nir_intrinsic_demote_if) { + cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + get_src(ctx, instr->src[0]), + ctx->ac.i32_0, ""); + } else { + assert(instr->intrinsic == nir_intrinsic_demote); + cond = ctx->ac.i1false; + } + + /* Kill immediately while maintaining WQM. */ + ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond)); + + LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, ""); + LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill); + return; } static LLVMValueRef @@ -3024,6 +3366,12 @@ static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, LLVMValueRef ptr, int src_idx) { + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, + ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7005); + } + LLVMValueRef result; LLVMValueRef src = get_src(ctx, instr->src[src_idx]); @@ -3084,12 +3432,30 @@ static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, case nir_intrinsic_deref_atomic_exchange: op = LLVMAtomicRMWBinOpXchg; break; +#if LLVM_VERSION_MAJOR >= 10 + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_deref_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; +#endif default: return NULL; } - result = ac_build_atomic_rmw(&ctx->ac, op, ptr, ac_to_integer(&ctx->ac, src), sync_scope); + LLVMValueRef val; + + if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd || + instr->intrinsic == nir_intrinsic_deref_atomic_fadd) { + val = ac_to_float(&ctx->ac, src); + } else { + val = ac_to_integer(&ctx->ac, src); + } + + result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope); } + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7005); return result; } @@ -3223,6 +3589,13 @@ static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); } +static LLVMValueRef barycentric_model(struct ac_nir_context *ctx) +{ + return LLVMBuildBitCast(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->pull_model), + ctx->ac.v3i32, ""); +} + static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, LLVMValueRef interp_param, unsigned index, unsigned comp_start, @@ -3230,13 +3603,26 @@ static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, unsigned bitsize) { LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); + LLVMValueRef interp_param_f; - interp_param = LLVMBuildBitCast(ctx->ac.builder, + interp_param_f = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, ""); LLVMValueRef i = LLVMBuildExtractElement( - 
ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); + ctx->ac.builder, interp_param_f, ctx->ac.i32_0, ""); LLVMValueRef j = LLVMBuildExtractElement( - ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + ctx->ac.builder, interp_param_f, ctx->ac.i32_1, ""); + + /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */ + if (ctx->verified_interp && + !_mesa_hash_table_search(ctx->verified_interp, interp_param)) { + LLVMValueRef args[2]; + args[0] = i; + args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false); + LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, + args, 2, AC_FUNC_ATTR_READNONE); + ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, "")); + _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param); + } LLVMValueRef values[4]; assert(bitsize == 16 || bitsize == 32); @@ -3254,40 +3640,121 @@ static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components)); } -static LLVMValueRef load_flat_input(struct ac_nir_context *ctx, - unsigned index, unsigned comp_start, - unsigned num_components, - unsigned bit_size) +static LLVMValueRef visit_load(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr, bool is_output) { - LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); - LLVMValueRef values[8]; + LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef component_type; + unsigned base = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + unsigned count = instr->dest.ssa.num_components * + (instr->dest.ssa.bit_size == 64 ? 2 : 1); + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + LLVMValueRef vertex_index = + vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL; + nir_src offset = *nir_get_io_offset_src(instr); + LLVMValueRef indir_index = NULL; + + if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) + component_type = LLVMGetElementType(dest_type); + else + component_type = dest_type; + + if (nir_src_is_const(offset)) + assert(nir_src_as_uint(offset) == 0); + else + indir_index = get_src(ctx, offset); + + if (ctx->stage == MESA_SHADER_TESS_CTRL || + (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) { + LLVMValueRef result = + ctx->abi->load_tess_varyings(ctx->abi, component_type, + vertex_index, indir_index, + 0, 0, base * 4, + component, + instr->num_components, + false, false, !is_output); + if (instr->dest.ssa.bit_size == 16) { + result = ac_to_integer(&ctx->ac, result); + result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); + } + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); + } + + /* No indirect indexing is allowed after this point. */ + assert(!indir_index); + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + assert(nir_src_is_const(*vertex_index_src)); + + return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component, + instr->num_components, + nir_src_as_uint(*vertex_index_src), + 0, type); + } + + if (ctx->stage == MESA_SHADER_FRAGMENT && is_output && + nir_intrinsic_io_semantics(instr).fb_fetch_output) + return ctx->abi->emit_fbfetch(ctx->abi); - /* Each component of a 64-bit value takes up two GL-level channels. */ - unsigned channels = - bit_size == 64 ? 
num_components * 2 : num_components; + /* Other non-fragment cases have inputs and outputs in temporaries. */ + if (ctx->stage != MESA_SHADER_FRAGMENT) { + for (unsigned chan = component; chan < count + component; chan++) { + if (is_output) { + values[chan] = LLVMBuildLoad(ctx->ac.builder, + ctx->abi->outputs[base * 4 + chan], ""); + } else { + values[chan] = ctx->abi->inputs[base * 4 + chan]; + if (!values[chan]) + values[chan] = LLVMGetUndef(ctx->ac.i32); + } + } + LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component); + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); + } + + /* Fragment shader inputs. */ + unsigned vertex_id = 2; /* P0 */ + + if (instr->intrinsic == nir_intrinsic_load_input_vertex) { + nir_const_value *src0 = nir_src_as_const_value(instr->src[0]); + + switch (src0[0].i32) { + case 0: + vertex_id = 2; + break; + case 1: + vertex_id = 0; + break; + case 2: + vertex_id = 1; + break; + default: + unreachable("Invalid vertex index"); + } + } - for (unsigned chan = 0; chan < channels; chan++) { - if (comp_start + chan > 4) - attr_number = LLVMConstInt(ctx->ac.i32, index + 1, false); - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (comp_start + chan) % 4, false); + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false); + + for (unsigned chan = 0; chan < count; chan++) { + if (component + chan > 4) + attr_number = LLVMConstInt(ctx->ac.i32, base + 1, false); + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false); values[chan] = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->ac.i32, 2, false), + LLVMConstInt(ctx->ac.i32, vertex_id, false), llvm_chan, attr_number, ac_get_arg(&ctx->ac, ctx->args->prim_mask)); values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, ""); values[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan], - bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, ""); + instr->dest.ssa.bit_size == 16 ? ctx->ac.i16 + : ctx->ac.i32, ""); } - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, channels); - if (bit_size == 64) { - LLVMTypeRef type = num_components == 1 ? 
ctx->ac.i64 : - LLVMVectorType(ctx->ac.i64, num_components); - result = LLVMBuildBitCast(ctx->ac.builder, result, type, ""); - } - return result; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, count); + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); } static void visit_intrinsic(struct ac_nir_context *ctx, @@ -3408,6 +3875,9 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_load_helper_invocation: result = ac_build_load_helper_invocation(&ctx->ac); break; + case nir_intrinsic_is_helper_invocation: + result = ac_build_is_helper_invocation(&ctx->ac); + break; case nir_intrinsic_load_color0: result = ctx->abi->color0; break; @@ -3481,6 +3951,19 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_store_deref: visit_store_var(ctx, instr); break; + case nir_intrinsic_load_input: + case nir_intrinsic_load_input_vertex: + case nir_intrinsic_load_per_vertex_input: + result = visit_load(ctx, instr, false); + break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: + result = visit_load(ctx, instr, true); + break; + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: + visit_store_output(ctx, instr); + break; case nir_intrinsic_load_shared: result = visit_load_shared(ctx, instr); break; @@ -3538,12 +4021,17 @@ static void visit_intrinsic(struct ac_nir_context *ctx, result = visit_image_size(ctx, instr, false); break; case nir_intrinsic_shader_clock: - result = ac_build_shader_clock(&ctx->ac); + result = ac_build_shader_clock(&ctx->ac, + nir_intrinsic_memory_scope(instr)); break; case nir_intrinsic_discard: case nir_intrinsic_discard_if: emit_discard(ctx, instr); break; + case nir_intrinsic_demote: + case nir_intrinsic_demote_if: + emit_demote(ctx, instr); + break; case nir_intrinsic_memory_barrier: case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_buffer: @@ -3551,6 +4039,25 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_memory_barrier_shared: emit_membar(&ctx->ac, instr); break; + case nir_intrinsic_scoped_barrier: { + assert(!(nir_intrinsic_memory_semantics(instr) & + (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE))); + + nir_variable_mode modes = nir_intrinsic_memory_modes(instr); + + unsigned wait_flags = 0; + if (modes & (nir_var_mem_global | nir_var_mem_ssbo)) + wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE; + if (modes & nir_var_mem_shared) + wait_flags |= AC_WAIT_LGKM; + + if (wait_flags) + ac_build_waitcnt(&ctx->ac, wait_flags); + + if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP) + ac_emit_barrier(&ctx->ac, ctx->stage); + break; + } case nir_intrinsic_memory_barrier_tcs_patch: break; case nir_intrinsic_control_barrier: @@ -3565,7 +4072,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_shared_atomic_or: case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: { + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: { LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->src[1].ssa->bit_size); result = visit_var_atomic(ctx, instr, ptr, 1); @@ -3580,7 +4088,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_deref_atomic_or: case nir_intrinsic_deref_atomic_xor: case nir_intrinsic_deref_atomic_exchange: - case nir_intrinsic_deref_atomic_comp_swap: { + case nir_intrinsic_deref_atomic_comp_swap: + case 
@@ -3565,7 +4072,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
 	case nir_intrinsic_shared_atomic_or:
 	case nir_intrinsic_shared_atomic_xor:
 	case nir_intrinsic_shared_atomic_exchange:
-	case nir_intrinsic_shared_atomic_comp_swap: {
+	case nir_intrinsic_shared_atomic_comp_swap:
+	case nir_intrinsic_shared_atomic_fadd: {
 		LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0],
 						  instr->src[1].ssa->bit_size);
 		result = visit_var_atomic(ctx, instr, ptr, 1);
@@ -3580,7 +4088,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
 	case nir_intrinsic_deref_atomic_or:
 	case nir_intrinsic_deref_atomic_xor:
 	case nir_intrinsic_deref_atomic_exchange:
-	case nir_intrinsic_deref_atomic_comp_swap: {
+	case nir_intrinsic_deref_atomic_comp_swap:
+	case nir_intrinsic_deref_atomic_fadd: {
 		LLVMValueRef ptr = get_src(ctx, instr->src[0]);
 		result = visit_var_atomic(ctx, instr, ptr, 1);
 		break;
@@ -3594,6 +4103,9 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
 	case nir_intrinsic_load_barycentric_sample:
 		result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr));
 		break;
+	case nir_intrinsic_load_barycentric_model:
+		result = barycentric_model(ctx);
+		break;
 	case nir_intrinsic_load_barycentric_at_offset: {
 		LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
 		result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset);
@@ -3619,23 +4131,19 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
 					    instr->dest.ssa.bit_size);
 		break;
 	}
-	case nir_intrinsic_load_input: {
-		/* We only lower inputs for fragment shaders ATM */
-		ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[0]);
-		assert(offset);
-		assert(offset[0].i32 == 0);
-
-		unsigned index = nir_intrinsic_base(instr);
-		unsigned component = nir_intrinsic_component(instr);
-		result = load_flat_input(ctx, index, component,
-					 instr->dest.ssa.num_components,
-					 instr->dest.ssa.bit_size);
-		break;
-	}
 	case nir_intrinsic_emit_vertex:
 		ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs);
 		break;
+	case nir_intrinsic_emit_vertex_with_counter: {
+		unsigned stream = nir_intrinsic_stream_id(instr);
+		LLVMValueRef next_vertex = get_src(ctx, instr->src[0]);
+		ctx->abi->emit_vertex_with_counter(ctx->abi, stream,
+						   next_vertex,
+						   ctx->abi->outputs);
+		break;
+	}
 	case nir_intrinsic_end_primitive:
+	case nir_intrinsic_end_primitive_with_counter:
 		ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
 		break;
 	case nir_intrinsic_load_tess_coord:
@@ -3667,8 +4175,33 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
 		break;
 	}
 	case nir_intrinsic_shuffle:
-		result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
-				get_src(ctx, instr->src[1]));
+		if (ctx->ac.chip_class == GFX8 ||
+		    ctx->ac.chip_class == GFX9 ||
+		    (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) {
+			result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
+						  get_src(ctx, instr->src[1]));
+		} else {
+			LLVMValueRef src = get_src(ctx, instr->src[0]);
+			LLVMValueRef index = get_src(ctx, instr->src[1]);
+			LLVMTypeRef type = LLVMTypeOf(src);
+			struct waterfall_context wctx;
+			LLVMValueRef index_val;
+
+			index_val = enter_waterfall(ctx, &wctx, index, true);
+
+			src = LLVMBuildZExt(ctx->ac.builder, src,
+					    ctx->ac.i32, "");
+
+			result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane",
+						    ctx->ac.i32,
+						    (LLVMValueRef []) { src, index_val }, 2,
+						    AC_FUNC_ATTR_READNONE |
+						    AC_FUNC_ATTR_CONVERGENT);
+
+			result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
+
+			result = exit_waterfall(ctx, &wctx, result);
+		}
 		break;
 	case nir_intrinsic_reduce:
 		result = ac_build_reduce(&ctx->ac,
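The shuffle fallback above is the first user of the waterfall helpers introduced by this patch. As an editor's sketch of the general pattern (emit_op is a hypothetical stand-in for any operation that needs a scalar index, such as the readlane above):

	struct waterfall_context wctx;
	/* Loop until every active lane's index has been selected once. */
	LLVMValueRef scalar_index = enter_waterfall(ctx, &wctx, divergent_index, true);
	LLVMValueRef value = emit_op(ctx, scalar_index);
	/* Phi the per-iteration results together and close the loop. */
	value = exit_waterfall(ctx, &wctx, value);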
@@ -3826,11 +4359,20 @@ static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx,
 	return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, "");
 }
 
-static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
-				     nir_deref_instr *deref_instr,
-				     enum ac_descriptor_type desc_type,
-				     const nir_instr *instr,
-				     bool image, bool write)
+struct sampler_desc_address {
+	unsigned descriptor_set;
+	unsigned base_index; /* binding in Vulkan */
+	unsigned constant_index;
+	LLVMValueRef dynamic_index;
+	bool image;
+	bool bindless;
+};
+
+static struct sampler_desc_address
+get_sampler_desc_internal(struct ac_nir_context *ctx,
+			  nir_deref_instr *deref_instr,
+			  const nir_instr *instr,
+			  bool image)
 {
 	LLVMValueRef index = NULL;
 	unsigned constant_index = 0;
@@ -3903,12 +4445,40 @@ static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
 		} else
 			base_index = deref_instr->var->data.binding;
 	}
+	return (struct sampler_desc_address) {
+		.descriptor_set = descriptor_set,
+		.base_index = base_index,
+		.constant_index = constant_index,
+		.dynamic_index = index,
+		.image = image,
+		.bindless = bindless,
+	};
+}
+
+/* Extract any possibly divergent index into a separate value that can be fed
+ * into get_sampler_desc with the same arguments. */
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+					   nir_deref_instr *deref_instr,
+					   const nir_instr *instr,
+					   bool image)
+{
+	struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
+	return addr.dynamic_index;
+}
 
+static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
+				     nir_deref_instr *deref_instr,
+				     enum ac_descriptor_type desc_type,
+				     const nir_instr *instr,
+				     LLVMValueRef index,
+				     bool image, bool write)
+{
+	struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
 	return ctx->abi->load_sampler_desc(ctx->abi,
-					   descriptor_set,
-					   base_index,
-					   constant_index, index,
-					   desc_type, image, write, bindless);
+					   addr.descriptor_set,
+					   addr.base_index,
+					   addr.constant_index, index,
+					   desc_type, addr.image, write, addr.bindless);
 }
 
 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
@@ -3942,6 +4512,7 @@ static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx,
 
 static void tex_fetch_ptrs(struct ac_nir_context *ctx,
 			   nir_tex_instr *instr,
+			   struct waterfall_context *wctx,
 			   LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
 			   LLVMValueRef *fmask_ptr)
 {
@@ -3965,9 +4536,19 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
 		}
 	}
 
+	LLVMValueRef texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr,
+								    &instr->instr, false);
 	if (!sampler_deref_instr)
 		sampler_deref_instr = texture_deref_instr;
 
+	LLVMValueRef sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr,
+								    &instr->instr, false);
+	if (instr->texture_non_uniform)
+		texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true);
+
+	if (instr->sampler_non_uniform)
+		sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true);
+
 	enum ac_descriptor_type main_descriptor = instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
 
 	if (plane >= 0) {
@@ -3978,16 +4559,26 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
 		main_descriptor = AC_DESC_PLANE_0 + plane;
 	}
 
-	*res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, false, false);
+	if (instr->op == nir_texop_fragment_mask_fetch) {
+		/* The fragment mask is fetched from the compressed
+		 * multisampled surface.
+		 */
+		main_descriptor = AC_DESC_FMASK;
+	}
+
+	*res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr,
+				    texture_dynamic_index, false, false);
 
 	if (samp_ptr) {
-		*samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, false, false);
+		*samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr,
+					     sampler_dynamic_index, false, false);
 		if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
 			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
 	}
 	if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
 			  instr->op == nir_texop_samples_identical))
-		*fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, false, false);
+		*fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK,
+					      &instr->instr, texture_dynamic_index, false, false);
 }
 
 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
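Note that the two waterfall contexts nest: tex_fetch_ptrs opens wctx[0] for a non-uniform texture index before wctx[1] for a non-uniform sampler index, which is why visit_tex below unwinds them with exit_waterfall in reverse order at write_result.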
@@ -4006,8 +4597,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 	LLVMValueRef fmask_ptr = NULL, sample_index = NULL;
 	LLVMValueRef ddx = NULL, ddy = NULL;
 	unsigned offset_src = 0;
+	struct waterfall_context wctx[2] = {{{0}}};
 
-	tex_fetch_ptrs(ctx, instr, &args.resource, &args.sampler, &fmask_ptr);
+	tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr);
 
 	for (unsigned i = 0; i < instr->num_srcs; i++) {
 		switch (instr->src[i].src_type) {
@@ -4030,8 +4622,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 			offset_src = i;
 			break;
 		case nir_tex_src_bias:
-			if (instr->op == nir_texop_txb)
-				args.bias = get_src(ctx, instr->src[i].src);
+			args.bias = get_src(ctx, instr->src[i].src);
 			break;
 		case nir_tex_src_lod: {
 			if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0)
@@ -4051,6 +4642,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 		case nir_tex_src_ddy:
 			ddy = get_src(ctx, instr->src[i].src);
 			break;
+		case nir_tex_src_min_lod:
+			args.min_lod = get_src(ctx, instr->src[i].src);
+			break;
 		case nir_tex_src_texture_offset:
 		case nir_tex_src_sampler_offset:
 		case nir_tex_src_plane:
@@ -4066,6 +4660,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 
 	if (instr->op == nir_texop_texture_samples) {
 		LLVMValueRef res, samples, is_msaa;
+		LLVMValueRef default_sample;
+
 		res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
 		samples = LLVMBuildExtractElement(ctx->ac.builder, res,
 						  LLVMConstInt(ctx->ac.i32, 3, false), "");
@@ -4082,8 +4678,27 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 					LLVMConstInt(ctx->ac.i32, 0xf, false), "");
 		samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1,
 				       samples, "");
+
+		if (ctx->abi->robust_buffer_access) {
+			LLVMValueRef dword1, is_null_descriptor;
+
+			/* Extract the second dword of the descriptor; if it's
+			 * all zero, then it's a null descriptor.
+			 */
+			dword1 = LLVMBuildExtractElement(ctx->ac.builder, res,
+							 LLVMConstInt(ctx->ac.i32, 1, false), "");
+			is_null_descriptor =
+				LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
+					      LLVMConstInt(ctx->ac.i32, 0, false), "");
+			default_sample =
+				LLVMBuildSelect(ctx->ac.builder, is_null_descriptor,
+						ctx->ac.i32_0, ctx->ac.i32_1, "");
+		} else {
+			default_sample = ctx->ac.i32_1;
+		}
+
 		samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples,
-					  ctx->ac.i32_1, "");
+					  default_sample, "");
 		result = samples;
 		goto write_result;
 	}
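In plain C terms, the texture_samples path above now computes the following. This editor's sketch (txs_samples is a hypothetical name) only restates the selects that are emitted into LLVM IR:

	static uint32_t txs_samples(bool is_msaa, uint32_t log2_samples,
				    bool robust, bool is_null_descriptor)
	{
		if (is_msaa)
			return 1u << log2_samples;
		/* With robust access, a null descriptor reports 0 samples
		 * instead of 1. */
		return (robust && is_null_descriptor) ? 0 : 1;
	}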
@@ -4195,7 +4810,10 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 	     instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
 	     instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
 	    instr->is_array &&
-	    instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
+	    instr->op != nir_texop_txf &&
+	    instr->op != nir_texop_txf_ms &&
+	    instr->op != nir_texop_fragment_fetch &&
+	    instr->op != nir_texop_fragment_mask_fetch) {
 		args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]);
 	}
 
@@ -4214,7 +4832,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 	}
 
 	/* Pack sample index */
-	if (instr->op == nir_texop_txf_ms && sample_index)
+	if (sample_index && (instr->op == nir_texop_txf_ms ||
+			     instr->op == nir_texop_fragment_fetch))
 		args.coords[instr->coord_components] = sample_index;
 
 	if (instr->op == nir_texop_samples_identical) {
@@ -4233,7 +4852,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 
 	if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ||
 	     instr->sampler_dim == GLSL_SAMPLER_DIM_MS) &&
-	    instr->op != nir_texop_txs) {
+	    instr->op != nir_texop_txs &&
+	    instr->op != nir_texop_fragment_fetch &&
+	    instr->op != nir_texop_fragment_mask_fetch) {
 		unsigned sample_chan = instr->is_array ? 3 : 2;
 		args.coords[sample_chan] = adjust_sample_index_using_fmask(
 			&ctx->ac, args.coords[0], args.coords[1],
@@ -4271,6 +4892,23 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 		args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array);
 		args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
 	}
+
+	/* Adjust the number of coordinates because we only need (x,y) for 2D
+	 * multisampled images and (x,y,layer) for 2D multisampled layered
+	 * images or for multisampled input attachments.
+	 */
+	if (instr->op == nir_texop_fragment_mask_fetch) {
+		if (args.dim == ac_image_2dmsaa) {
+			args.dim = ac_image_2d;
+		} else {
+			assert(args.dim == ac_image_2darraymsaa);
+			args.dim = ac_image_2darray;
+		}
+	}
+
+	assert(instr->dest.is_ssa);
+	args.d16 = instr->dest.ssa.bit_size == 16;
+
 	result = build_tex_intrinsic(ctx, instr, &args);
 
 	if (instr->op == nir_texop_query_levels)
@@ -4302,11 +4940,15 @@ write_result:
 	if (result) {
 		assert(instr->dest.is_ssa);
 		result = ac_to_integer(&ctx->ac, result);
+
+		for (int i = ARRAY_SIZE(wctx); --i >= 0;) {
+			result = exit_waterfall(ctx, wctx + i, result);
+		}
+
 		ctx->ssa_defs[instr->dest.ssa.index] = result;
 	}
 }
 
-
 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
 {
 	LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
@@ -4337,19 +4979,46 @@ static void phi_post_pass(struct ac_nir_context *ctx)
 }
 
+static bool is_def_used_in_an_export(const nir_ssa_def* def) {
+	nir_foreach_use(use_src, def) {
+		if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
+			nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr);
+			if (instr->intrinsic == nir_intrinsic_store_deref)
+				return true;
+		} else if (use_src->parent_instr->type == nir_instr_type_alu) {
+			nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr);
+			if (instr->op == nir_op_vec4 &&
+			    is_def_used_in_an_export(&instr->dest.dest.ssa)) {
+				return true;
+			}
+		}
+	}
+	return false;
+}
+
 static void visit_ssa_undef(struct ac_nir_context *ctx,
 			    const nir_ssa_undef_instr *instr)
 {
 	unsigned num_components = instr->def.num_components;
 	LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
-	LLVMValueRef undef;
 
-	if (num_components == 1)
-		undef = LLVMGetUndef(type);
-	else {
-		undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+	if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) {
+		LLVMValueRef undef;
+
+		if (num_components == 1)
+			undef = LLVMGetUndef(type);
+		else {
+			undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+		}
+		ctx->ssa_defs[instr->def.index] = undef;
+	} else {
+		LLVMValueRef zero = LLVMConstInt(type, 0, false);
+		if (num_components > 1) {
+			zero = ac_build_gather_values_extended(
+				&ctx->ac, &zero, 4, 0, false, false);
+		}
+		ctx->ssa_defs[instr->def.index] = zero;
 	}
-	ctx->ssa_defs[instr->def.index] = undef;
 }
 
 static void visit_jump(struct ac_llvm_context *ctx,
@@ -4491,7 +5160,7 @@ static void visit_deref(struct ac_nir_context *ctx,
 		break;
 	case nir_deref_type_ptr_as_array:
 		if (instr->mode == nir_var_mem_global) {
-			unsigned stride = nir_deref_instr_ptr_as_array_stride(instr);
+			unsigned stride = nir_deref_instr_array_stride(instr);
 
 			LLVMValueRef index = get_src(ctx, instr->arr.index);
 			if (LLVMTypeOf(index) != ctx->ac.i64)
@@ -4694,7 +5363,7 @@ setup_locals(struct ac_nir_context *ctx,
 {
 	int i, j;
 	ctx->num_locals = 0;
-	nir_foreach_variable(variable, &func->impl->locals) {
+	nir_foreach_function_temp_variable(variable, func->impl) {
 		unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
 		variable->data.driver_location = ctx->num_locals * 4;
 		variable->data.location_frac = 0;
@@ -4794,9 +5463,13 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 
 	ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 
-	nir_foreach_variable(variable, &nir->outputs)
-		ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
-					     ctx.stage);
+	/* TODO: remove this after RADV switches to lowered IO */
+	if (!nir->info.io_lowered) {
+		nir_foreach_shader_out_variable(variable, nir) {
+			ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
+						     ctx.stage);
+		}
+	}
 
 	ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
 	                                   _mesa_key_pointer_equal);
@@ -4805,6 +5478,10 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 	ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
 	                                   _mesa_key_pointer_equal);
 
+	if (ctx.abi->kill_ps_if_inf_interp)
+		ctx.verified_interp = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+							      _mesa_key_pointer_equal);
+
 	func = (struct nir_function *)exec_list_get_head(&nir->functions);
 
 	nir_index_ssa_defs(func->impl);
@@ -4817,9 +5494,19 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 	if (gl_shader_stage_is_compute(nir->info.stage))
 		setup_shared(&ctx, nir);
 
+	if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.fs.uses_demote) {
+		ctx.ac.postponed_kill = ac_build_alloca_undef(&ctx.ac, ac->i1, "");
+		/* true = don't kill. */
+		LLVMBuildStore(ctx.ac.builder, ctx.ac.i1true, ctx.ac.postponed_kill);
+	}
+
 	visit_cf_list(&ctx, &func->impl->body);
 	phi_post_pass(&ctx);
 
+	if (ctx.ac.postponed_kill)
+		ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder,
+							      ctx.ac.postponed_kill, ""));
+
 	if (!gl_shader_stage_is_compute(nir->info.stage))
 		ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS,
 				      ctx.abi->outputs);
 
@@ -4829,6 +5516,8 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 	ralloc_free(ctx.defs);
 	ralloc_free(ctx.phis);
 	ralloc_free(ctx.vars);
+	if (ctx.abi->kill_ps_if_inf_interp)
+		ralloc_free(ctx.verified_interp);
 }
 
 bool
@@ -4875,33 +5564,26 @@ ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class)
 	 */
 	indirect_mask |= nir_var_function_temp;
 
-	progress |= nir_lower_indirect_derefs(nir, indirect_mask);
+	progress |= nir_lower_indirect_derefs(nir, indirect_mask, UINT32_MAX);
 
 	return progress;
 }
 
 static unsigned
 get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin)
 {
-	if (intrin->intrinsic != nir_intrinsic_store_deref)
+	if (intrin->intrinsic != nir_intrinsic_store_output)
 		return 0;
 
-	nir_variable *var =
-		nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
-
-	if (var->data.mode != nir_var_shader_out)
-		return 0;
+	unsigned writemask = nir_intrinsic_write_mask(intrin) <<
+		nir_intrinsic_component(intrin);
+	unsigned location = nir_intrinsic_io_semantics(intrin).location;
 
-	unsigned writemask = 0;
-	const int location = var->data.location;
-	unsigned first_component = var->data.location_frac;
-	unsigned num_comps = intrin->dest.ssa.num_components;
+	if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
+		return writemask << 4;
+	else if (location == VARYING_SLOT_TESS_LEVEL_INNER)
+		return writemask;
 
-	if (location == VARYING_SLOT_TESS_LEVEL_INNER)
-		writemask = ((1 << (num_comps + 1)) - 1) << first_component;
-	else if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
-		writemask = (((1 << (num_comps + 1)) - 1) << first_component) << 4;
-
-	return writemask;
+	return 0;
 }
 
 static void
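A worked example for the rewritten get_inst_tessfactor_writemask() above: a store_output with write_mask 0x3 and component 0 targeting VARYING_SLOT_TESS_LEVEL_OUTER returns (0x3 << 0) << 4 = 0x30, while the same store to VARYING_SLOT_TESS_LEVEL_INNER returns 0x3. Callers therefore see the inner tess levels in bits 0-3 of the combined mask and the outer levels in bits 4-7.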