X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Famd%2Fcommon%2Fac_nir_to_llvm.c;h=6467ed66ae5454a817d51009fb6603290b538a8d;hb=e6378962ce43727056756a373f5001da041b160e;hp=3b705b4b40dfc7ba5773a570d342b5517afbe320;hpb=5081fd398e52c365f326870bee9155a31e4d99ec;p=mesa.git diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 3b705b4b40d..6467ed66ae5 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -111,10 +111,7 @@ struct nir_to_llvm_context { LLVMValueRef oc_lds; LLVMValueRef merged_wave_info; LLVMValueRef tess_factor_offset; - LLVMValueRef tcs_patch_id; - LLVMValueRef tcs_rel_ids; LLVMValueRef tes_rel_patch_id; - LLVMValueRef tes_patch_id; LLVMValueRef tes_u; LLVMValueRef tes_v; @@ -404,7 +401,7 @@ static LLVMValueRef get_rel_patch_id(struct nir_to_llvm_context *ctx) { switch (ctx->stage) { case MESA_SHADER_TESS_CTRL: - return unpack_param(&ctx->ac, ctx->tcs_rel_ids, 0, 8); + return unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); case MESA_SHADER_TESS_EVAL: return ctx->tes_rel_patch_id; break; @@ -504,19 +501,26 @@ get_tcs_out_current_patch_data_offset(struct nir_to_llvm_context *ctx) ""); } -static void set_userdata_location(struct ac_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs) +static void +set_loc(struct ac_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs, + uint32_t indirect_offset) { ud_info->sgpr_idx = *sgpr_idx; ud_info->num_sgprs = num_sgprs; - ud_info->indirect = false; - ud_info->indirect_offset = 0; + ud_info->indirect = indirect_offset > 0; + ud_info->indirect_offset = indirect_offset; *sgpr_idx += num_sgprs; } -static void set_userdata_location_shader(struct nir_to_llvm_context *ctx, - int idx, uint8_t *sgpr_idx, uint8_t num_sgprs) +static void +set_loc_shader(struct nir_to_llvm_context *ctx, int idx, uint8_t *sgpr_idx, + uint8_t num_sgprs) { - set_userdata_location(&ctx->shader_info->user_sgprs_locs.shader_data[idx], sgpr_idx, num_sgprs); + struct ac_userdata_info *ud_info = + &ctx->shader_info->user_sgprs_locs.shader_data[idx]; + assert(ud_info); + + set_loc(ud_info, sgpr_idx, num_sgprs, 0); } static void @@ -527,10 +531,7 @@ set_loc_desc(struct nir_to_llvm_context *ctx, int idx, uint8_t *sgpr_idx, &ctx->shader_info->user_sgprs_locs.descriptor_sets[idx]; assert(ud_info); - ud_info->sgpr_idx = *sgpr_idx; - ud_info->num_sgprs = 2; - ud_info->indirect = indirect_offset > 0; - ud_info->indirect_offset = indirect_offset; + set_loc(ud_info, sgpr_idx, 2, indirect_offset); } struct user_sgpr_info { @@ -540,19 +541,20 @@ struct user_sgpr_info { }; static void allocate_user_sgprs(struct nir_to_llvm_context *ctx, + gl_shader_stage stage, struct user_sgpr_info *user_sgpr_info) { memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info)); /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */ - if (ctx->stage == MESA_SHADER_GEOMETRY || - ctx->stage == MESA_SHADER_VERTEX || - ctx->stage == MESA_SHADER_TESS_CTRL || - ctx->stage == MESA_SHADER_TESS_EVAL || + if (stage == MESA_SHADER_GEOMETRY || + stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_CTRL || + stage == MESA_SHADER_TESS_EVAL || ctx->is_gs_copy_shader) user_sgpr_info->need_ring_offsets = true; - if (ctx->stage == MESA_SHADER_FRAGMENT && + if (stage == MESA_SHADER_FRAGMENT && ctx->shader_info->info.ps.needs_sample_positions) user_sgpr_info->need_ring_offsets = true; @@ -561,7 +563,8 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx, 
user_sgpr_info->sgpr_count += 2; } - switch (ctx->stage) { + /* FIXME: fix the number of user sgprs for merged shaders on GFX9 */ + switch (stage) { case MESA_SHADER_COMPUTE: if (ctx->shader_info->info.cs.uses_grid_size) user_sgpr_info->sgpr_count += 3; @@ -594,10 +597,12 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx, break; } - if (ctx->shader_info->info.needs_push_constants) + if (ctx->shader_info->info.loads_push_constants) user_sgpr_info->sgpr_count += 2; - uint32_t remaining_sgprs = 16 - user_sgpr_info->sgpr_count; + uint32_t available_sgprs = ctx->options->chip_class >= GFX9 ? 32 : 16; + uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count; + if (remaining_sgprs / 2 < util_bitcount(ctx->shader_info->info.desc_set_used_mask)) { user_sgpr_info->sgpr_count += 2; user_sgpr_info->indirect_all_descriptor_sets = true; @@ -635,7 +640,7 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx, add_array_arg(args, const_array(type, 32), desc_sets); } - if (ctx->shader_info->info.needs_push_constants) { + if (ctx->shader_info->info.loads_push_constants) { /* 1 for push constants and dynamic descriptors */ add_array_arg(args, type, &ctx->push_constants); } @@ -668,9 +673,14 @@ declare_vs_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args) { add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id); if (!ctx->is_gs_copy_shader) { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + if (ctx->options->key.vs.as_ls) { + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + } else { + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); + } + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ } } @@ -680,7 +690,7 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args) add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u); add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v); add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_patch_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id); } static void @@ -704,9 +714,8 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage, ctx->descriptor_sets[i] = NULL; } } else { - set_userdata_location_shader(ctx, - AC_UD_INDIRECT_DESCRIPTOR_SETS, - user_sgpr_idx, 2); + set_loc_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, + user_sgpr_idx, 2); for (unsigned i = 0; i < num_sets; ++i) { if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) { @@ -722,9 +731,8 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage, ctx->shader_info->need_indirect_descriptor_sets = true; } - if (ctx->shader_info->info.needs_push_constants) { - set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, - user_sgpr_idx, 2); + if (ctx->shader_info->info.loads_push_constants) { + set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2); } } @@ -738,18 +746,16 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx, (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { if (ctx->shader_info->info.vs.has_vertex_buffers) { - set_userdata_location_shader(ctx, - AC_UD_VS_VERTEX_BUFFERS, - user_sgpr_idx, 2); + set_loc_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, + user_sgpr_idx, 
2); } unsigned vs_num = 2; if (ctx->shader_info->info.vs.needs_draw_id) vs_num++; - set_userdata_location_shader(ctx, - AC_UD_VS_BASE_VERTEX_START_INSTANCE, - user_sgpr_idx, vs_num); + set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, + user_sgpr_idx, vs_num); } } @@ -763,7 +769,7 @@ static void create_function(struct nir_to_llvm_context *ctx, struct arg_info args = {}; LLVMValueRef desc_sets; - allocate_user_sgprs(ctx, &user_sgpr_info); + allocate_user_sgprs(ctx, stage, &user_sgpr_info); if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) { add_arg(&args, ARG_SGPR, const_array(ctx->ac.v4i32, 16), @@ -850,9 +856,9 @@ static void create_function(struct nir_to_llvm_context *ctx, &ctx->view_index); add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->tcs_patch_id); + &ctx->abi.tcs_patch_id); add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->tcs_rel_ids); + &ctx->abi.tcs_rel_ids); declare_vs_input_vgprs(ctx, &args); } else { @@ -878,9 +884,9 @@ static void create_function(struct nir_to_llvm_context *ctx, add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tess_factor_offset); add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->tcs_patch_id); + &ctx->abi.tcs_patch_id); add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->tcs_rel_ids); + &ctx->abi.tcs_rel_ids); } break; case MESA_SHADER_TESS_EVAL: @@ -1042,7 +1048,8 @@ static void create_function(struct nir_to_llvm_context *ctx, user_sgpr_idx = 0; if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) { - set_userdata_location_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx, 2); + set_loc_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS, + &user_sgpr_idx, 2); if (ctx->options->supports_spill) { ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", LLVMPointerType(ctx->ac.i8, CONST_ADDR_SPACE), @@ -1063,17 +1070,18 @@ static void create_function(struct nir_to_llvm_context *ctx, switch (stage) { case MESA_SHADER_COMPUTE: if (ctx->shader_info->info.cs.uses_grid_size) { - set_userdata_location_shader(ctx, AC_UD_CS_GRID_SIZE, - &user_sgpr_idx, 3); + set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, + &user_sgpr_idx, 3); } break; case MESA_SHADER_VERTEX: set_vs_specific_input_locs(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_idx); if (ctx->view_index) - set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); if (ctx->options->key.vs.as_ls) { - set_userdata_location_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, + &user_sgpr_idx, 1); } if (ctx->options->key.vs.as_ls) ac_declare_lds_as_pointer(&ctx->ac); @@ -1082,16 +1090,17 @@ static void create_function(struct nir_to_llvm_context *ctx, set_vs_specific_input_locs(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_idx); if (has_previous_stage) - set_userdata_location_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, &user_sgpr_idx, 1); - set_userdata_location_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 4); + set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, + &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 4); if (ctx->view_index) - set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); ac_declare_lds_as_pointer(&ctx->ac); break; case MESA_SHADER_TESS_EVAL: - set_userdata_location_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, &user_sgpr_idx, 1); if 
(ctx->view_index) - set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); break; case MESA_SHADER_GEOMETRY: if (has_previous_stage) { @@ -1101,17 +1110,20 @@ static void create_function(struct nir_to_llvm_context *ctx, previous_stage, &user_sgpr_idx); else - set_userdata_location_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, + &user_sgpr_idx, 1); } - set_userdata_location_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES, &user_sgpr_idx, 2); + set_loc_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES, + &user_sgpr_idx, 2); if (ctx->view_index) - set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); if (has_previous_stage) ac_declare_lds_as_pointer(&ctx->ac); break; case MESA_SHADER_FRAGMENT: if (ctx->shader_info->info.ps.needs_sample_positions) { - set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, &user_sgpr_idx, 1); + set_loc_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, + &user_sgpr_idx, 1); } break; default: @@ -1121,32 +1133,10 @@ static void create_function(struct nir_to_llvm_context *ctx, ctx->shader_info->num_user_sgprs = user_sgpr_idx; } -static int get_llvm_num_components(LLVMValueRef value) -{ - LLVMTypeRef type = LLVMTypeOf(value); - unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind - ? LLVMGetVectorSize(type) - : 1; - return num_components; -} - -static LLVMValueRef llvm_extract_elem(struct ac_llvm_context *ac, - LLVMValueRef value, - int index) -{ - int count = get_llvm_num_components(value); - - if (count == 1) - return value; - - return LLVMBuildExtractElement(ac->builder, value, - LLVMConstInt(ac->i32, index, false), ""); -} - static LLVMValueRef trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count) { - unsigned num_components = get_llvm_num_components(value); + unsigned num_components = ac_get_llvm_num_components(value); if (count == num_components) return value; @@ -1348,26 +1338,49 @@ static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, } static LLVMValueRef emit_fsign(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, + unsigned bitsize) { - LLVMValueRef cmp, val; + LLVMValueRef cmp, val, zero, one; + LLVMTypeRef type; + + if (bitsize == 32) { + type = ctx->f32; + zero = ctx->f32_0; + one = ctx->f32_1; + } else { + type = ctx->f64; + zero = ctx->f64_0; + one = ctx->f64_1; + } - cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, ctx->f32_0, ""); - val = LLVMBuildSelect(ctx->builder, cmp, ctx->f32_1, src0, ""); - cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, ctx->f32_0, ""); - val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(ctx->f32, -1.0), ""); + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), ""); return val; } static LLVMValueRef emit_isign(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef cmp, val; + LLVMValueRef cmp, val, zero, one; + LLVMTypeRef type; - cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, ctx->i32_0, ""); - val = LLVMBuildSelect(ctx->builder, cmp, ctx->i32_1, src0, ""); - cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, ctx->i32_0, ""); - val = LLVMBuildSelect(ctx->builder, cmp, val, 
LLVMConstInt(ctx->i32, -1, true), ""); + if (bitsize == 32) { + type = ctx->i32; + zero = ctx->i32_0; + one = ctx->i32_1; + } else { + type = ctx->i64; + zero = ctx->i64_0; + one = ctx->i64_1; + } + + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), ""); return val; } @@ -1420,9 +1433,15 @@ static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, } static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, + unsigned bitsize) { - return LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + + if (bitsize == 32) + return result; + + return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); } static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, @@ -1725,7 +1744,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_frcp: src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, src[0]); + result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, + src[0]); break; case nir_op_iand: result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); @@ -1804,11 +1824,11 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = emit_minmax_int(&ctx->ac, LLVMIntULT, src[0], src[1]); break; case nir_op_isign: - result = emit_isign(&ctx->ac, src[0]); + result = emit_isign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_fsign: src[0] = ac_to_float(&ctx->ac, src[0]); - result = emit_fsign(&ctx->ac, src[0]); + result = emit_fsign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_ffloor: result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", @@ -1852,7 +1872,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) case nir_op_frsq: result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]); - result = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, result); + result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, + result); break; case nir_op_fpow: result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", @@ -1975,7 +1996,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = emit_f2b(&ctx->ac, src[0]); break; case nir_op_b2i: - result = emit_b2i(&ctx->ac, src[0]); + result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_i2b: src[0] = ac_to_integer(&ctx->ac, src[0]); @@ -2271,7 +2292,9 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, case nir_texop_txf: case nir_texop_txf_ms: case nir_texop_samples_identical: - args->opcode = instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? ac_image_load : ac_image_load_mip; + args->opcode = lod_is_zero || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? 
+ ac_image_load : ac_image_load_mip; args->compare = false; args->offset = false; break; @@ -2447,7 +2470,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, } else { assert(count == 1); - if (get_llvm_num_components(base_data) > 1) + if (ac_get_llvm_num_components(base_data) > 1) data = LLVMBuildExtractElement(ctx->ac.builder, base_data, LLVMConstInt(ctx->ac.i32, start, false), ""); else @@ -2474,9 +2497,9 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, int arg_count = 0; if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { - params[arg_count++] = llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); } - params[arg_count++] = llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); params[arg_count++] = ctx->abi->load_ssbo(ctx->abi, get_src(ctx, instr->src[0]), true); @@ -2588,7 +2611,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef results[8], ret; + LLVMValueRef ret; LLVMValueRef rsrc = get_src(ctx, instr->src[0]); LLVMValueRef offset = get_src(ctx, instr->src[1]); int num_components = instr->num_components; @@ -2599,20 +2622,9 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, if (instr->dest.ssa.bit_size == 64) num_components *= 2; - for (unsigned i = 0; i < num_components; ++i) { - LLVMValueRef params[] = { - rsrc, - LLVMBuildAdd(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 4 * i, 0), - offset, "") - }; - results[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.load.const.v4i32", ctx->ac.f32, - params, 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_LEGACY); - } - - - ret = ac_build_gather_values(&ctx->ac, results, num_components); + ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, + NULL, 0, false, false, true, true); + ret = trim_vector(&ctx->ac, ret, num_components); return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); } @@ -2826,35 +2838,33 @@ get_dw_address(struct nir_to_llvm_context *ctx, } static LLVMValueRef -load_tcs_input(struct nir_to_llvm_context *ctx, - nir_intrinsic_instr *instr) +load_tcs_input(struct ac_shader_abi *abi, + LLVMValueRef vertex_index, + LLVMValueRef indir_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact) { + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); LLVMValueRef dw_addr, stride; - unsigned const_index; - LLVMValueRef vertex_index; - LLVMValueRef indir_index; - unsigned param; LLVMValueRef value[4], result; - const bool per_vertex = nir_is_per_vertex_io(instr->variables[0]->var, ctx->stage); - const bool is_compact = instr->variables[0]->var->data.compact; - param = shader_io_get_unique_index(instr->variables[0]->var->data.location); - get_deref_offset(ctx->nir, instr->variables[0], - false, NULL, per_vertex ? 
&vertex_index : NULL, - &const_index, &indir_index); + unsigned param = shader_io_get_unique_index(location); stride = unpack_param(&ctx->ac, ctx->tcs_in_layout, 13, 8); dw_addr = get_tcs_in_current_patch_offset(ctx); dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride, indir_index); - unsigned comp = instr->variables[0]->var->data.location_frac; - for (unsigned i = 0; i < instr->num_components + comp; i++) { + for (unsigned i = 0; i < num_components + component; i++) { value[i] = ac_lds_load(&ctx->ac, dw_addr); dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, ctx->ac.i32_1, ""); } - result = ac_build_varying_gather_values(&ctx->ac, value, instr->num_components, comp); - result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, &instr->dest.ssa), ""); + result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component); return result; } @@ -2898,65 +2908,64 @@ load_tcs_output(struct nir_to_llvm_context *ctx, } static void -store_tcs_output(struct nir_to_llvm_context *ctx, - nir_intrinsic_instr *instr, +store_tcs_output(struct ac_shader_abi *abi, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + unsigned location, + unsigned driver_location, LLVMValueRef src, + unsigned component, + bool is_patch, + bool is_compact, unsigned writemask) { + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); LLVMValueRef dw_addr; LLVMValueRef stride = NULL; LLVMValueRef buf_addr = NULL; - LLVMValueRef vertex_index = NULL; - LLVMValueRef indir_index = NULL; - unsigned const_index = 0; unsigned param; - const unsigned comp = instr->variables[0]->var->data.location_frac; - const bool per_vertex = nir_is_per_vertex_io(instr->variables[0]->var, ctx->stage); - const bool is_compact = instr->variables[0]->var->data.compact; bool store_lds = true; - if (instr->variables[0]->var->data.patch) { - if (!(ctx->tcs_patch_outputs_read & (1U << (instr->variables[0]->var->data.location - VARYING_SLOT_PATCH0)))) + if (is_patch) { + if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0)))) store_lds = false; } else { - if (!(ctx->tcs_outputs_read & (1ULL << instr->variables[0]->var->data.location))) + if (!(ctx->tcs_outputs_read & (1ULL << location))) store_lds = false; } - get_deref_offset(ctx->nir, instr->variables[0], - false, NULL, per_vertex ? 
&vertex_index : NULL, - &const_index, &indir_index); - param = shader_io_get_unique_index(instr->variables[0]->var->data.location); - if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 && + param = shader_io_get_unique_index(location); + if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) { const_index -= 3; param++; } - if (!instr->variables[0]->var->data.patch) { + if (!is_patch) { stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8); dw_addr = get_tcs_out_current_patch_offset(ctx); } else { dw_addr = get_tcs_out_current_patch_data_offset(ctx); } - mark_tess_output(ctx, instr->variables[0]->var->data.patch, param); + mark_tess_output(ctx, is_patch, param); dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride, - indir_index); + param_index); buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact, - vertex_index, indir_index); + vertex_index, param_index); bool is_tess_factor = false; - if (instr->variables[0]->var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || - instr->variables[0]->var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) + if (location == VARYING_SLOT_TESS_LEVEL_INNER || + location == VARYING_SLOT_TESS_LEVEL_OUTER) is_tess_factor = true; unsigned base = is_compact ? const_index : 0; for (unsigned chan = 0; chan < 8; chan++) { if (!(writemask & (1 << chan))) continue; - LLVMValueRef value = llvm_extract_elem(&ctx->ac, src, chan - comp); + LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); if (store_lds || is_tess_factor) ac_lds_store(&ctx->ac, dw_addr, value); @@ -2978,39 +2987,36 @@ store_tcs_output(struct nir_to_llvm_context *ctx, } static LLVMValueRef -load_tes_input(struct nir_to_llvm_context *ctx, - const nir_intrinsic_instr *instr) +load_tes_input(struct ac_shader_abi *abi, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact) { + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); LLVMValueRef buf_addr; LLVMValueRef result; - LLVMValueRef vertex_index = NULL; - LLVMValueRef indir_index = NULL; - unsigned const_index = 0; - unsigned param; - const bool per_vertex = nir_is_per_vertex_io(instr->variables[0]->var, ctx->stage); - const bool is_compact = instr->variables[0]->var->data.compact; + unsigned param = shader_io_get_unique_index(location); - get_deref_offset(ctx->nir, instr->variables[0], - false, NULL, per_vertex ? 
&vertex_index : NULL, - &const_index, &indir_index); - param = shader_io_get_unique_index(instr->variables[0]->var->data.location); - if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 && - is_compact && const_index > 3) { + if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) { const_index -= 3; param++; } - unsigned comp = instr->variables[0]->var->data.location_frac; buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, - is_compact, vertex_index, indir_index); + is_compact, vertex_index, param_index); - LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, comp * 4, false); + LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false); buf_addr = LLVMBuildAdd(ctx->builder, buf_addr, comp_offset, ""); - result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, instr->num_components, NULL, + result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL, buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, false); - result = trim_vector(&ctx->ac, result, instr->num_components); - result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, &instr->dest.ssa), ""); + result = trim_vector(&ctx->ac, result, num_components); return result; } @@ -3114,6 +3120,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, LLVMValueRef indir_index; LLVMValueRef ret; unsigned const_index; + unsigned stride = instr->variables[0]->var->data.compact ? 1 : 4; bool vs_in = ctx->stage == MESA_SHADER_VERTEX && instr->variables[0]->var->data.mode == nir_var_shader_in; get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL, @@ -3124,10 +3131,29 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, switch (instr->variables[0]->var->data.mode) { case nir_var_shader_in: - if (ctx->stage == MESA_SHADER_TESS_CTRL) - return load_tcs_input(ctx->nctx, instr); - if (ctx->stage == MESA_SHADER_TESS_EVAL) - return load_tes_input(ctx->nctx, instr); + if (ctx->stage == MESA_SHADER_TESS_CTRL || + ctx->stage == MESA_SHADER_TESS_EVAL) { + LLVMValueRef result; + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + unsigned location = instr->variables[0]->var->data.location; + unsigned driver_location = instr->variables[0]->var->data.driver_location; + const bool is_patch = instr->variables[0]->var->data.patch; + const bool is_compact = instr->variables[0]->var->data.compact; + + get_deref_offset(ctx, instr->variables[0], + false, NULL, is_patch ? 
NULL : &vertex_index, + &const_index, &indir_index); + + result = ctx->abi->load_tess_inputs(ctx->abi, vertex_index, indir_index, + const_index, location, driver_location, + instr->variables[0]->var->data.location_frac, + instr->num_components, + is_patch, is_compact); + return LLVMBuildBitCast(ctx->ac.builder, result, get_def_type(ctx, &instr->dest.ssa), ""); + } + if (ctx->stage == MESA_SHADER_GEOMETRY) { LLVMValueRef indir_index; unsigned const_index, vertex_index; @@ -3149,13 +3175,13 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, count -= chan / 4; LLVMValueRef tmp_vec = ac_build_gather_values_extended( &ctx->ac, ctx->abi->inputs + idx + chan, count, - 4, false, true); + stride, false, true); values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); } else - values[chan] = ctx->abi->inputs[idx + chan + const_index * 4]; + values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; } break; case nir_var_local: @@ -3166,13 +3192,13 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, count -= chan / 4; LLVMValueRef tmp_vec = ac_build_gather_values_extended( &ctx->ac, ctx->locals + idx + chan, count, - 4, true, true); + stride, true, true); values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); } else { - values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * 4], ""); + values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], ""); } } break; @@ -3195,14 +3221,14 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, count -= chan / 4; LLVMValueRef tmp_vec = ac_build_gather_values_extended( &ctx->ac, ctx->outputs + idx + chan, count, - 4, true, true); + stride, true, true); values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); } else { values[chan] = LLVMBuildLoad(ctx->ac.builder, - ctx->outputs[idx + chan + const_index * 4], + ctx->outputs[idx + chan + const_index * stride], ""); } } @@ -3232,7 +3258,7 @@ visit_store_var(struct ac_nir_context *ctx, int old_writemask = writemask; src = LLVMBuildBitCast(ctx->ac.builder, src, - LLVMVectorType(ctx->ac.f32, get_llvm_num_components(src) * 2), + LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), ""); writemask = 0; @@ -3246,7 +3272,22 @@ visit_store_var(struct ac_nir_context *ctx, case nir_var_shader_out: if (ctx->stage == MESA_SHADER_TESS_CTRL) { - store_tcs_output(ctx->nctx, instr, src, writemask); + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + const unsigned location = instr->variables[0]->var->data.location; + const unsigned driver_location = instr->variables[0]->var->data.driver_location; + const unsigned comp = instr->variables[0]->var->data.location_frac; + const bool is_patch = instr->variables[0]->var->data.patch; + const bool is_compact = instr->variables[0]->var->data.compact; + + get_deref_offset(ctx, instr->variables[0], + false, NULL, is_patch ? 
NULL : &vertex_index, + &const_index, &indir_index); + + ctx->abi->store_tcs_outputs(ctx->abi, vertex_index, indir_index, + const_index, location, driver_location, + src, comp, is_patch, is_compact, writemask); return; } @@ -3255,7 +3296,7 @@ visit_store_var(struct ac_nir_context *ctx, if (!(writemask & (1 << chan))) continue; - value = llvm_extract_elem(&ctx->ac, src, chan - comp); + value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp); if (instr->variables[0]->var->data.compact) stride = 1; @@ -3284,7 +3325,7 @@ visit_store_var(struct ac_nir_context *ctx, if (!(writemask & (1 << chan))) continue; - value = llvm_extract_elem(&ctx->ac, src, chan); + value = ac_llvm_extract_elem(&ctx->ac, src, chan); if (indir_index) { unsigned count = glsl_count_attribute_slots( instr->variables[0]->var->type, false); @@ -3324,8 +3365,8 @@ visit_store_var(struct ac_nir_context *ctx, LLVMValueRef ptr = LLVMBuildStructGEP(ctx->ac.builder, address, chan, ""); - LLVMValueRef src = llvm_extract_elem(&ctx->ac, val, - chan); + LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, + chan); src = LLVMBuildBitCast( ctx->ac.builder, src, LLVMGetElementType(LLVMTypeOf(ptr)), ""); @@ -3457,7 +3498,7 @@ static LLVMValueRef get_image_coords(struct ac_nir_context *ctx, LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), }; LLVMValueRef res; - LLVMValueRef sample_index = llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[1]), 0); + LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[1]), 0); int count; enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); @@ -3504,7 +3545,7 @@ static LLVMValueRef get_image_coords(struct ac_nir_context *ctx, if (is_ms) count--; for (chan = 0; chan < count; ++chan) { - coords[chan] = llvm_extract_elem(&ctx->ac, src0, chan); + coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan); } if (add_frag_pos) { for (chan = 0; chan < 2; ++chan) @@ -3812,19 +3853,18 @@ static void emit_membar(struct nir_to_llvm_context *ctx, ac_build_waitcnt(&ctx->ac, waitcnt); } -static void emit_barrier(struct nir_to_llvm_context *ctx) +static void emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) { /* SI only (thanks to a hw bug workaround): * The real barrier instruction isn’t needed, because an entire patch * always fits into a single wave. 
*/ - if (ctx->options->chip_class == SI && - ctx->stage == MESA_SHADER_TESS_CTRL) { - ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT); + if (ac->chip_class == SI && stage == MESA_SHADER_TESS_CTRL) { + ac_build_waitcnt(ac, LGKM_CNT & VM_CNT); return; } - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.s.barrier", - ctx->ac.voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT); + ac_build_intrinsic(ac, "llvm.amdgcn.s.barrier", + ac->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT); } static void emit_discard_if(struct ac_nir_context *ctx, @@ -4137,9 +4177,11 @@ visit_end_primitive(struct nir_to_llvm_context *ctx, } static LLVMValueRef -visit_load_tess_coord(struct nir_to_llvm_context *ctx, - const nir_intrinsic_instr *instr) +load_tess_coord(struct ac_shader_abi *abi, LLVMTypeRef type, + unsigned num_components) { + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + LLVMValueRef coord[4] = { ctx->tes_u, ctx->tes_v, @@ -4151,9 +4193,15 @@ visit_load_tess_coord(struct nir_to_llvm_context *ctx, coord[2] = LLVMBuildFSub(ctx->builder, ctx->ac.f32_1, LLVMBuildFAdd(ctx->builder, coord[0], coord[1], ""), ""); - LLVMValueRef result = ac_build_gather_values(&ctx->ac, coord, instr->num_components); - return LLVMBuildBitCast(ctx->builder, result, - get_def_type(ctx->nir, &instr->dest.ssa), ""); + LLVMValueRef result = ac_build_gather_values(&ctx->ac, coord, num_components); + return LLVMBuildBitCast(ctx->builder, result, type, ""); +} + +static LLVMValueRef +load_patch_vertices_in(struct ac_shader_abi *abi) +{ + struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); + return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false); } static void visit_intrinsic(struct ac_nir_context *ctx, @@ -4196,21 +4244,17 @@ static void visit_intrinsic(struct ac_nir_context *ctx, break; case nir_intrinsic_load_invocation_id: if (ctx->stage == MESA_SHADER_TESS_CTRL) - result = unpack_param(&ctx->ac, ctx->nctx->tcs_rel_ids, 8, 5); + result = unpack_param(&ctx->ac, ctx->abi->tcs_rel_ids, 8, 5); else result = ctx->abi->gs_invocation_id; break; case nir_intrinsic_load_primitive_id: if (ctx->stage == MESA_SHADER_GEOMETRY) { - if (ctx->nctx) - ctx->nctx->shader_info->gs.uses_prim_id = true; result = ctx->abi->gs_prim_id; } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { - ctx->nctx->shader_info->tcs.uses_prim_id = true; - result = ctx->nctx->tcs_patch_id; + result = ctx->abi->tcs_patch_id; } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - ctx->nctx->shader_info->tcs.uses_prim_id = true; - result = ctx->nctx->tes_patch_id; + result = ctx->abi->tes_patch_id; } else fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); break; @@ -4320,7 +4364,7 @@ static void visit_intrinsic(struct ac_nir_context *ctx, emit_membar(ctx->nctx, instr); break; case nir_intrinsic_barrier: - emit_barrier(ctx->nctx); + emit_barrier(&ctx->ac, ctx->stage); break; case nir_intrinsic_var_atomic_add: case nir_intrinsic_var_atomic_imin: @@ -4346,11 +4390,21 @@ static void visit_intrinsic(struct ac_nir_context *ctx, case nir_intrinsic_end_primitive: visit_end_primitive(ctx->nctx, instr); break; - case nir_intrinsic_load_tess_coord: - result = visit_load_tess_coord(ctx->nctx, instr); + case nir_intrinsic_load_tess_coord: { + LLVMTypeRef type = ctx->nctx ? 
+ get_def_type(ctx->nctx->nir, &instr->dest.ssa) : + NULL; + result = ctx->abi->load_tess_coord(ctx->abi, type, instr->num_components); + break; + } + case nir_intrinsic_load_tess_level_outer: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER); + break; + case nir_intrinsic_load_tess_level_inner: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER); break; case nir_intrinsic_load_patch_vertices_in: - result = LLVMConstInt(ctx->ac.i32, ctx->nctx->options->key.tcs.input_vertices, false); + result = ctx->abi->load_patch_vertices_in(ctx->abi); break; default: fprintf(stderr, "Unknown intrinsic: "); @@ -4702,7 +4756,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) if (coord) for (chan = 0; chan < instr->coord_components; chan++) - coords[chan] = llvm_extract_elem(&ctx->ac, coord, chan); + coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); if (offsets && instr->op != nir_texop_txf) { LLVMValueRef offset[3], pack; @@ -4710,8 +4764,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) offset[chan] = ctx->ac.i32_0; args.offset = true; - for (chan = 0; chan < get_llvm_num_components(offsets); chan++) { - offset[chan] = llvm_extract_elem(&ctx->ac, offsets, chan); + for (chan = 0; chan < ac_get_llvm_num_components(offsets); chan++) { + offset[chan] = ac_llvm_extract_elem(&ctx->ac, offsets, chan); offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan], LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); if (chan) @@ -4731,7 +4785,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) /* Pack depth comparison value */ if (instr->is_shadow && comparator) { LLVMValueRef z = ac_to_float(&ctx->ac, - llvm_extract_elem(&ctx->ac, comparator, 0)); + ac_llvm_extract_elem(&ctx->ac, comparator, 0)); /* TC-compatible HTILE on radeonsi promotes Z16 and Z24 to Z32_FLOAT, * so the depth comparison value isn't clamped for Z16 and @@ -4775,8 +4829,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) } for (unsigned i = 0; i < num_src_deriv_channels; i++) { - derivs[i] = ac_to_float(&ctx->ac, llvm_extract_elem(&ctx->ac, ddx, i)); - derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, llvm_extract_elem(&ctx->ac, ddy, i)); + derivs[i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddx, i)); + derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i)); } for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { derivs[i] = ctx->ac.f32_0; @@ -5158,8 +5212,13 @@ handle_vs_input_decl(struct nir_to_llvm_context *ctx, if (ctx->options->key.vs.instance_rate_inputs & (1u << index)) { buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.instance_id, ctx->abi.start_instance, ""); - ctx->shader_info->vs.vgpr_comp_cnt = MAX2(3, - ctx->shader_info->vs.vgpr_comp_cnt); + if (ctx->options->key.vs.as_ls) { + ctx->shader_info->vs.vgpr_comp_cnt = + MAX2(2, ctx->shader_info->vs.vgpr_comp_cnt); + } else { + ctx->shader_info->vs.vgpr_comp_cnt = + MAX2(1, ctx->shader_info->vs.vgpr_comp_cnt); + } } else buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.vertex_id, ctx->abi.base_vertex, ""); @@ -5535,6 +5594,7 @@ setup_locals(struct ac_nir_context *ctx, nir_foreach_variable(variable, &func->impl->locals) { unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); variable->data.driver_location = ctx->num_locals * 4; + variable->data.location_frac = 0; ctx->num_locals += attrib_count; } ctx->locals = 
malloc(4 * ctx->num_locals * sizeof(LLVMValueRef)); @@ -6142,13 +6202,13 @@ write_tess_factors(struct nir_to_llvm_context *ctx) { unsigned stride, outer_comps, inner_comps; struct ac_build_if_state if_ctx, inner_if_ctx; - LLVMValueRef invocation_id = unpack_param(&ctx->ac, ctx->tcs_rel_ids, 8, 5); - LLVMValueRef rel_patch_id = unpack_param(&ctx->ac, ctx->tcs_rel_ids, 0, 8); + LLVMValueRef invocation_id = unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5); + LLVMValueRef rel_patch_id = unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); unsigned tess_inner_index, tess_outer_index; LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; int i; - emit_barrier(ctx); + emit_barrier(&ctx->ac, ctx->stage); switch (ctx->options->key.tcs.primitive_mode) { case GL_ISOLINES: @@ -6532,8 +6592,8 @@ static void ac_nir_fixup_ls_hs_input_vgprs(struct nir_to_llvm_context *ctx) ctx->ac.i32_0, ""); ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, ""); ctx->vs_prim_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.vertex_id, ctx->vs_prim_id, ""); - ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->tcs_rel_ids, ctx->rel_auto_id, ""); - ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->tcs_patch_id, ctx->abi.vertex_id, ""); + ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, ""); + ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, ""); } static void prepare_gs_input_vgprs(struct nir_to_llvm_context *ctx) @@ -6675,19 +6735,31 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { ctx.tcs_outputs_read = shaders[i]->info.outputs_read; ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read; + ctx.abi.load_tess_inputs = load_tcs_input; + ctx.abi.load_patch_vertices_in = load_patch_vertices_in; + ctx.abi.store_tcs_outputs = store_tcs_output; } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode; + ctx.abi.load_tess_inputs = load_tes_input; + ctx.abi.load_tess_coord = load_tess_coord; + ctx.abi.load_patch_vertices_in = load_patch_vertices_in; } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) { if (shader_info->info.vs.needs_instance_id) { - ctx.shader_info->vs.vgpr_comp_cnt = - MAX2(3, ctx.shader_info->vs.vgpr_comp_cnt); + if (ctx.ac.chip_class == GFX9 && + shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) { + ctx.shader_info->vs.vgpr_comp_cnt = + MAX2(2, ctx.shader_info->vs.vgpr_comp_cnt); + } else { + ctx.shader_info->vs.vgpr_comp_cnt = + MAX2(1, ctx.shader_info->vs.vgpr_comp_cnt); + } } } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard; } if (i) - emit_barrier(&ctx); + emit_barrier(&ctx.ac, ctx.stage); ac_setup_rings(&ctx); @@ -6873,6 +6945,20 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm, /* +3 for scratch wave offset and VCC */ config->num_sgprs = MAX2(config->num_sgprs, shader_info->num_input_sgprs + 3); + + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * If denormals are enabled, all floating-point output modifiers are + * ignored. 
+ * + * Don't enable denormals for 32-bit floats, because: + * - Floating-point output modifiers would be ignored by the hw. + * - Some opcodes don't support denormals, such as v_mad_f32. We would + * have to stop using those. + * - SI & CI would be very slow. + */ + config->float_mode |= V_00B028_FP_64_DENORMS; } static void @@ -6905,7 +6991,7 @@ ac_fill_shader_info(struct ac_shader_variant_info *shader_info, struct nir_shade case MESA_SHADER_VERTEX: shader_info->vs.as_es = options->key.vs.as_es; shader_info->vs.as_ls = options->key.vs.as_ls; - /* in LS mode we need at least 1, invocation id needs 3, handled elsewhere */ + /* in LS mode we need at least 1, invocation id needs 2, handled elsewhere */ if (options->key.vs.as_ls) shader_info->vs.vgpr_comp_cnt = MAX2(1, shader_info->vs.vgpr_comp_cnt); break; @@ -6930,6 +7016,14 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm, ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir[0]->info.stage, dump_shader, options->supports_spill); for (int i = 0; i < nir_count; ++i) ac_fill_shader_info(shader_info, nir[i], options); + + /* Determine the ES type (VS or TES) for the GS on GFX9. */ + if (options->chip_class == GFX9) { + if (nir_count == 2 && + nir[1]->info.stage == MESA_SHADER_GEOMETRY) { + shader_info->gs.es_type = nir[0]->info.stage; + } + } } static void
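
Annotations: the blocks below are standalone C sketches of constructs this patch touches. They are illustrative notes under stated assumptions, not part of the patch; every name suffixed _sketch is hypothetical.

get_rel_patch_id() and write_tess_factors() read fields out of the packed tcs_rel_ids SGPR with unpack_param(&ctx->ac, ..., offset, width). A scalar sketch of the shift-and-mask this expands to, assuming the packing implied by the calls in this patch (relative patch id in bits [0,8), TCS invocation id in bits [8,13)):

#include <stdint.h>

static uint32_t unpack_param_sketch(uint32_t param, unsigned rshift,
                                    unsigned bitwidth)
{
    uint32_t v = param >> rshift;
    if (bitwidth < 32)
        v &= (1u << bitwidth) - 1; /* keep only 'bitwidth' bits */
    return v;
}

/* rel_patch_id  = unpack_param_sketch(tcs_rel_ids, 0, 8);
 * invocation_id = unpack_param_sketch(tcs_rel_ids, 8, 5); */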
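
set_userdata_location() and its descriptor-set variant are folded into one set_loc() helper that records where a value lives in the user-SGPR file and advances the allocation cursor. A minimal self-contained sketch of that cursor pattern, with a simplified stand-in for ac_userdata_info:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct userdata_info_sketch {
    int8_t   sgpr_idx;        /* first user SGPR holding the value */
    uint8_t  num_sgprs;       /* consecutive SGPRs it occupies */
    bool     indirect;        /* loaded through a pointer instead? */
    uint32_t indirect_offset; /* byte offset when indirect */
};

static void set_loc_sketch(struct userdata_info_sketch *ud,
                           uint8_t *sgpr_idx, uint8_t num_sgprs,
                           uint32_t indirect_offset)
{
    ud->sgpr_idx = *sgpr_idx;
    ud->num_sgprs = num_sgprs;
    ud->indirect = indirect_offset > 0;
    ud->indirect_offset = indirect_offset;
    *sgpr_idx += num_sgprs; /* advance the cursor for the next entry */
}

int main(void)
{
    struct userdata_info_sketch scratch, push;
    uint8_t cursor = 0;

    set_loc_sketch(&scratch, &cursor, 2, 0); /* e.g. AC_UD_SCRATCH_RING_OFFSETS */
    set_loc_sketch(&push, &cursor, 2, 0);    /* e.g. AC_UD_PUSH_CONSTANTS */
    printf("push constants at s%d, next free SGPR: %d\n",
           push.sgpr_idx, cursor);
    return 0;
}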
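
allocate_user_sgprs() now budgets against 32 user SGPRs on GFX9 instead of 16. Each directly bound descriptor set costs one 64-bit pointer, i.e. two SGPRs; when the sets the shader actually uses no longer fit in the remaining budget, it falls back to a single indirect pointer to a table of sets. Sketch of that decision:

#include <stdbool.h>
#include <stdint.h>

static unsigned popcount32_sketch(uint32_t v)
{
    unsigned n = 0;
    for (; v; v &= v - 1) /* clear lowest set bit each iteration */
        n++;
    return n;
}

static bool need_indirect_descriptor_sets_sketch(bool gfx9,
                                                 unsigned used_sgprs,
                                                 uint32_t desc_set_used_mask)
{
    unsigned available = gfx9 ? 32 : 16; /* user SGPRs on this chip */
    unsigned remaining = available - used_sgprs;

    /* two SGPRs per directly bound set pointer */
    return remaining / 2 < popcount32_sketch(desc_set_used_mask);
}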
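
emit_fsign()/emit_isign() gain a bitsize parameter but keep the same branchless lowering: two compare+select pairs per type. Scalar equivalents, which also show the edge cases the select chain produces:

#include <stdint.h>

static double fsign_sketch(double x)
{
    double val = (x > 0.0) ? 1.0 : x; /* LLVMRealOGT + select */
    /* +0.0 and -0.0 pass through; NaN fails both ordered compares
     * and therefore ends up as -1.0 */
    return (val >= 0.0) ? val : -1.0; /* LLVMRealOGE + select */
}

static int64_t isign_sketch(int64_t x)
{
    int64_t val = (x > 0) ? 1 : x; /* LLVMIntSGT + select */
    return (val >= 0) ? val : -1;  /* LLVMIntSGE + select */
}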
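
emit_b2i() masks the 32-bit NIR boolean down to 0/1 and, for 64-bit destinations, appends the zero-extension this patch adds. Scalar sketch:

#include <stdint.h>

static uint64_t b2i64_sketch(uint32_t nir_bool)
{
    uint32_t result = nir_bool & 1u; /* NIR booleans are 0 or ~0u here */
    return (uint64_t)result;         /* the added ZExt for bit_size == 64 */
}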
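
visit_load_ubo_buffer() replaces the per-dword llvm.SI.load.const calls with a single ac_build_buffer_load() (num_components is doubled first for 64-bit results), then trims the vector back to the requested width before the final bitcast. The trim step on a plain array, assuming the load may return more dwords than were asked for:

static void trim_vector_sketch(const float *in, float *out, unsigned count)
{
    /* keep only the first 'count' components of a wider load result */
    for (unsigned i = 0; i < count; i++)
        out[i] = in[i];
}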
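
load_tcs_input() resolves inputs from LDS: the per-vertex stride comes from bits [13,21) of tcs_in_layout (the unpack_param(..., 13, 8) call) and data is read one dword per channel. Sketch of the gather loop, assuming dword-granular LDS addressing:

#include <stdint.h>

/* value[] must cover component + num_components channels (at most 8) */
static void load_tcs_input_sketch(const uint32_t *lds, uint32_t base_dw_addr,
                                  unsigned component, unsigned num_components,
                                  uint32_t value[8])
{
    uint32_t dw_addr = base_dw_addr; /* patch base + vertex * stride + slot */
    for (unsigned i = 0; i < num_components + component; i++) {
        value[i] = lds[dw_addr]; /* ac_lds_load() */
        dw_addr += 1;            /* one dword per channel */
    }
    /* ac_build_varying_gather_values() then keeps channels
     * [component, component + num_components) */
}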
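
store_tcs_output() iterates a writemask of up to eight channels (64-bit values are split into dword pairs, which is why the mask can exceed four bits) and indexes the source relative to the variable's first component. Sketch, assuming consecutive channels land one dword apart:

#include <stdint.h>

static void store_tcs_output_sketch(uint32_t *lds, uint32_t dw_addr,
                                    const uint32_t *src, unsigned component,
                                    unsigned writemask)
{
    for (unsigned chan = 0; chan < 8; chan++) {
        if (!(writemask & (1u << chan)))
            continue;
        /* ac_llvm_extract_elem(src, chan - component) in the patch */
        lds[dw_addr + chan] = src[chan - component];
    }
}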
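
visit_load_var() now honours compact variables (e.g. gl_ClipDistance): they pack one scalar per array element, so indexing uses a stride of 1 instead of the usual 4 channels per driver location. Sketch of the indexing:

#include <stdbool.h>

static float load_var_channel_sketch(const float *storage,
                                     unsigned driver_location, unsigned chan,
                                     unsigned const_index, bool compact)
{
    unsigned stride = compact ? 1 : 4; /* scalars per array element */
    return storage[driver_location + chan + const_index * stride];
}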
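
load_tess_coord() receives only u and v in VGPRs; for triangle domains the third barycentric coordinate is reconstructed as w = 1 - (u + v). For example, u = 0.25, v = 0.5 yields w = 0.25. Scalar sketch:

#include <stdbool.h>

static void tess_coord_sketch(float u, float v, bool triangles,
                              float coord[3])
{
    coord[0] = u;
    coord[1] = v;
    coord[2] = triangles ? 1.0f - (u + v) : 0.0f;
}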
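
With the reordered VS input VGPRs in declare_vs_input_vgprs(), instance_id sits in VGPR2 in the LS layout and VGPR1 otherwise, so instanced fetches in handle_vs_input_decl() only raise vs.vgpr_comp_cnt to 2 or 1 instead of the old 3. Sketch of the bookkeeping, treating vgpr_comp_cnt as the highest system-value VGPR index the hardware must load (an assumption about the field's meaning):

#include <stdbool.h>

static unsigned vs_vgpr_comp_cnt_sketch(unsigned current,
                                        bool uses_instance_id, bool as_ls)
{
    if (!uses_instance_id)
        return current;
    unsigned needed = as_ls ? 2 : 1;            /* VGPR holding instance_id */
    return needed > current ? needed : current; /* MAX2 */
}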
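
On GFX9 the geometry shader is merged with the stage that feeds it, so ac_compile_nir_shader() records whether the ES half of the pair was a VS or a TES. Sketch of that decision, with a simplified stage enum standing in for gl_shader_stage:

#include <stdbool.h>

enum sketch_stage { SKETCH_VERTEX, SKETCH_TESS_EVAL, SKETCH_GEOMETRY };

/* Returns the ES stage recorded for a merged GFX9 GS, or -1 when the
 * shader pair is not a merged ES+GS. */
static int gs_es_type_sketch(const enum sketch_stage *stages, int nir_count,
                             bool is_gfx9)
{
    if (is_gfx9 && nir_count == 2 && stages[1] == SKETCH_GEOMETRY)
        return stages[0]; /* SKETCH_VERTEX or SKETCH_TESS_EVAL */
    return -1;
}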