ac/nir: Add function creation for merged LS+HS.

[mesa.git] / src / amd / common / ac_nir_to_llvm.c
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c

index 22e915dd0dd8238e9d2a02fdfe569a568f6f4aa2..c6c56f30b8167dc57a66c67252cbc995f63a9d77 100644 (file)
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -39,6 +39,7 @@ enum radeon_llvm_calling_convention {
         RADEON_LLVM_AMDGPU_GS = 88,
         RADEON_LLVM_AMDGPU_PS = 89,
         RADEON_LLVM_AMDGPU_CS = 90,
+       RADEON_LLVM_AMDGPU_HS = 93,
  };
  
  #define CONST_ADDR_SPACE 2
@@ -107,6 +108,7 @@ struct nir_to_llvm_context {
         LLVMValueRef tcs_out_layout;
         LLVMValueRef tcs_in_layout;
         LLVMValueRef oc_lds;
+       LLVMValueRef merged_wave_info;
         LLVMValueRef tess_factor_offset;
         LLVMValueRef tcs_patch_id;
         LLVMValueRef tcs_rel_ids;
@@ -223,13 +225,15 @@ static void set_llvm_calling_convention(LLVMValueRef func,
  
         switch (stage) {
         case MESA_SHADER_VERTEX:
-       case MESA_SHADER_TESS_CTRL:
         case MESA_SHADER_TESS_EVAL:
                 calling_conv = RADEON_LLVM_AMDGPU_VS;
                 break;
         case MESA_SHADER_GEOMETRY:
                 calling_conv = RADEON_LLVM_AMDGPU_GS;
                 break;
+       case MESA_SHADER_TESS_CTRL:
+               calling_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS : RADEON_LLVM_AMDGPU_VS;
+               break;
         case MESA_SHADER_FRAGMENT:
                 calling_conv = RADEON_LLVM_AMDGPU_PS;
                 break;
@@ -624,36 +628,133 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
         }
  }
  
-static void create_function(struct nir_to_llvm_context *ctx)
+static void
+radv_define_common_user_sgprs_phase1(struct nir_to_llvm_context *ctx,
+                                     gl_shader_stage stage,
+                                     bool has_previous_stage,
+                                     gl_shader_stage previous_stage,
+                                     const struct user_sgpr_info *user_sgpr_info,
+                                     struct arg_info *args,
+                                     LLVMValueRef *desc_sets)
  {
         unsigned num_sets = ctx->options->layout ? ctx->options->layout->num_sets : 0;
-       uint8_t user_sgpr_idx;
-       struct user_sgpr_info user_sgpr_info;
-       struct arg_info args = {};
-       LLVMValueRef desc_sets;
-
-       allocate_user_sgprs(ctx, &user_sgpr_info);
-       if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
-               add_user_sgpr_argument(&args, const_array(ctx->v4i32, 16), &ctx->ring_offsets); /* address of rings */
-       }
+       unsigned stage_mask = 1 << stage;
+       if (has_previous_stage)
+               stage_mask |= 1 << previous_stage;
  
         /* 1 for each descriptor set */
-       if (!user_sgpr_info.indirect_all_descriptor_sets) {
+       if (!user_sgpr_info->indirect_all_descriptor_sets) {
                 for (unsigned i = 0; i < num_sets; ++i) {
-                       if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
-                               add_user_sgpr_array_argument(&args, const_array(ctx->i8, 1024 * 1024), &ctx->descriptor_sets[i]);
+                       if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
+                               add_user_sgpr_array_argument(args, const_array(ctx->i8, 1024 * 1024), &ctx->descriptor_sets[i]);
                         }
                 }
         } else
-               add_user_sgpr_array_argument(&args, const_array(const_array(ctx->i8, 1024 * 1024), 32), &desc_sets);
+               add_user_sgpr_array_argument(args, const_array(const_array(ctx->i8, 1024 * 1024), 32), desc_sets);
  
         if (ctx->shader_info->info.needs_push_constants) {
                 /* 1 for push constants and dynamic descriptors */
-               add_user_sgpr_array_argument(&args, const_array(ctx->i8, 1024 * 1024), &ctx->push_constants);
+               add_user_sgpr_array_argument(args, const_array(ctx->i8, 1024 * 1024), &ctx->push_constants);
         }
+}
  
-       switch (ctx->stage) {
+static void /* Phase 2: record where the common user SGPRs (descriptor sets, push constants) were placed. */
+radv_define_common_user_sgprs_phase2(struct nir_to_llvm_context *ctx,
+                                     gl_shader_stage stage, /* stage being compiled */
+                                     bool has_previous_stage, /* when set, previous_stage also contributes to the stage mask */
+                                     gl_shader_stage previous_stage, /* only read when has_previous_stage is set */
+                                     const struct user_sgpr_info *user_sgpr_info, /* only indirect_all_descriptor_sets is read here */
+                                    LLVMValueRef desc_sets, /* base pointer indexed per set; only used on the indirect path */
+                                     uint8_t *user_sgpr_idx) /* running user SGPR index, passed on to the set_userdata_location* helpers */
+{
+       unsigned num_sets = ctx->options->layout ? ctx->options->layout->num_sets : 0; /* no layout means no descriptor sets */
+       unsigned stage_mask = 1 << stage;
+       if (has_previous_stage)
+               stage_mask |= 1 << previous_stage; /* a set counts as used if either stage references it */
+
+       if (!user_sgpr_info->indirect_all_descriptor_sets) {
+               for (unsigned i = 0; i < num_sets; ++i) {
+                       if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
+                               set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2); /* direct path: 2 SGPRs recorded per used set */
+                       } else
+                               ctx->descriptor_sets[i] = NULL; /* set is not referenced by this stage mask */
+               }
+       } else {
+               uint32_t desc_sgpr_idx = *user_sgpr_idx; /* snapshot taken before the next call; presumably that call advances *user_sgpr_idx -- confirm */
+               set_userdata_location_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, user_sgpr_idx, 2);
+
+               for (unsigned i = 0; i < num_sets; ++i) {
+                       if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
+                               set_userdata_location_indirect(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], desc_sgpr_idx, 2, i * 8); /* all used sets share the snapshotted index; i * 8 is the per-set offset (presumably bytes per 64-bit pointer -- confirm) */
+                               ctx->descriptor_sets[i] = ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->i32, i, false)); /* load entry i from desc_sets */
+
+                       } else
+                               ctx->descriptor_sets[i] = NULL; /* set is not referenced by this stage mask */
+               }
+               ctx->shader_info->need_indirect_descriptor_sets = true;
+       }
+
+       if (ctx->shader_info->info.needs_push_constants) {
+               set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2); /* recorded after the descriptor sets, the same order in which phase1 declares the arguments */
+       }
+}
+
+static void /* Phase 1: declare the vertex-shader user SGPR arguments (vertex buffers, base vertex, start instance, draw id). */
+radv_define_vs_user_sgprs_phase1(struct nir_to_llvm_context *ctx,
+                                 gl_shader_stage stage, /* stage being compiled */
+                                 bool has_previous_stage, /* when set, previous_stage is also checked for MESA_SHADER_VERTEX */
+                                 gl_shader_stage previous_stage, /* only read when has_previous_stage is set */
+                                 struct arg_info *args) /* argument list the new user SGPRs are appended to */
+{
+       if (!ctx->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { /* nothing is declared for GS copy shaders or when neither stage is a vertex shader */
+               if (ctx->shader_info->info.vs.has_vertex_buffers)
+                       add_user_sgpr_argument(args, const_array(ctx->v4i32, 16), &ctx->vertex_buffers); /* vertex buffers */
+               add_user_sgpr_argument(args, ctx->i32, &ctx->abi.base_vertex); // base vertex
+               add_user_sgpr_argument(args, ctx->i32, &ctx->abi.start_instance);// start instance
+               if (ctx->shader_info->info.vs.needs_draw_id)
+                       add_user_sgpr_argument(args, ctx->i32, &ctx->abi.draw_id); // draw id
+       }
+}
+
+static void /* Phase 2: record the user-SGPR locations of the arguments declared by radv_define_vs_user_sgprs_phase1. */
+radv_define_vs_user_sgprs_phase2(struct nir_to_llvm_context *ctx,
+                                 gl_shader_stage stage, /* stage being compiled */
+                                 bool has_previous_stage, /* when set, previous_stage is also checked for MESA_SHADER_VERTEX */
+                                 gl_shader_stage previous_stage, /* only read when has_previous_stage is set */
+                                 uint8_t *user_sgpr_idx) /* running user SGPR index, passed on to set_userdata_location_shader */
+{
+       if (!ctx->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { /* same condition as phase1, so both phases cover the same arguments */
+               if (ctx->shader_info->info.vs.has_vertex_buffers) {
+                       set_userdata_location_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx, 2); /* recorded with a count of 2 */
+               }
+               unsigned vs_num = 2; /* base vertex + start instance */
+               if (ctx->shader_info->info.vs.needs_draw_id)
+                       vs_num++; /* draw id is the optional third i32 declared in phase1 */
+
+               set_userdata_location_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num); /* one entry covers all vs_num values */
+       }
+}
+
+
+static void create_function(struct nir_to_llvm_context *ctx,
+                            gl_shader_stage stage,
+                            bool has_previous_stage,
+                            gl_shader_stage previous_stage)
+{
+       uint8_t user_sgpr_idx;
+       struct user_sgpr_info user_sgpr_info;
+       struct arg_info args = {};
+       LLVMValueRef desc_sets;
+
+       allocate_user_sgprs(ctx, &user_sgpr_info);
+
+       if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
+               add_user_sgpr_argument(&args, const_array(ctx->v4i32, 16), &ctx->ring_offsets); /* address of rings */
+       }
+
+       switch (stage) {
         case MESA_SHADER_COMPUTE:
+               radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
                 if (ctx->shader_info->info.cs.grid_components_used)
                         add_user_sgpr_argument(&args, LLVMVectorType(ctx->i32, ctx->shader_info->info.cs.grid_components_used), &ctx->num_work_groups); /* grid size */
                 add_sgpr_argument(&args, LLVMVectorType(ctx->i32, 3), &ctx->workgroup_ids);
@@ -661,14 +762,8 @@ static void create_function(struct nir_to_llvm_context *ctx)
                 add_vgpr_argument(&args, LLVMVectorType(ctx->i32, 3), &ctx->local_invocation_ids);
                 break;
         case MESA_SHADER_VERTEX:
-               if (!ctx->is_gs_copy_shader) {
-                       if (ctx->shader_info->info.vs.has_vertex_buffers)
-                               add_user_sgpr_argument(&args, const_array(ctx->v4i32, 16), &ctx->vertex_buffers); /* vertex buffers */
-                       add_user_sgpr_argument(&args, ctx->i32, &ctx->abi.base_vertex); // base vertex
-                       add_user_sgpr_argument(&args, ctx->i32, &ctx->abi.start_instance);// start instance
-                       if (ctx->shader_info->info.vs.needs_draw_id)
-                               add_user_sgpr_argument(&args, ctx->i32, &ctx->abi.draw_id); // draw id
-               }
+               radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
+               radv_define_vs_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &args);
                 if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
                         add_user_sgpr_argument(&args, ctx->i32, &ctx->view_index);
                 if (ctx->options->key.vs.as_es)
@@ -683,18 +778,49 @@ static void create_function(struct nir_to_llvm_context *ctx)
                 }
                 break;
         case MESA_SHADER_TESS_CTRL:
-               add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_offchip_layout); // tcs offchip layout
-               add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_offsets); // tcs out offsets
-               add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_layout); // tcs out layout
-               add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_in_layout); // tcs in layout
-               if (ctx->shader_info->info.needs_multiview_view_index)
-                       add_user_sgpr_argument(&args, ctx->i32, &ctx->view_index);
-               add_sgpr_argument(&args, ctx->i32, &ctx->oc_lds); // param oc lds
-               add_sgpr_argument(&args, ctx->i32, &ctx->tess_factor_offset); // tess factor offset
-               add_vgpr_argument(&args, ctx->i32, &ctx->tcs_patch_id); // patch id
-               add_vgpr_argument(&args, ctx->i32, &ctx->tcs_rel_ids); // rel ids;
+               if (has_previous_stage) {
+                       // First 6 system regs
+                       add_sgpr_argument(&args, ctx->i32, &ctx->oc_lds); // param oc lds
+                       add_sgpr_argument(&args, ctx->i32, &ctx->merged_wave_info); // merged wave info
+                       add_sgpr_argument(&args, ctx->i32, &ctx->tess_factor_offset); // tess factor offset
+
+                       add_sgpr_argument(&args, ctx->i32, NULL); // scratch offset
+                       add_sgpr_argument(&args, ctx->i32, NULL); // unknown
+                       add_sgpr_argument(&args, ctx->i32, NULL); // unknown
+
+                       radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
+                       radv_define_vs_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &args);
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->ls_out_layout); // ls out layout
+
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_offchip_layout); // tcs offchip layout
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_offsets); // tcs out offsets
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_layout); // tcs out layout
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_in_layout); // tcs in layout
+                       if (ctx->shader_info->info.needs_multiview_view_index)
+                               add_user_sgpr_argument(&args, ctx->i32, &ctx->view_index);
+
+                       add_vgpr_argument(&args, ctx->i32, &ctx->tcs_patch_id); // patch id
+                       add_vgpr_argument(&args, ctx->i32, &ctx->tcs_rel_ids); // rel ids;
+                       add_vgpr_argument(&args, ctx->i32, &ctx->abi.vertex_id); // vertex id
+                       add_vgpr_argument(&args, ctx->i32, &ctx->rel_auto_id); // rel auto id
+                       add_vgpr_argument(&args, ctx->i32, &ctx->vs_prim_id); // vs prim id
+                       add_vgpr_argument(&args, ctx->i32, &ctx->abi.instance_id); // instance id
+               } else {
+                       radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_offchip_layout); // tcs offchip layout
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_offsets); // tcs out offsets
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_out_layout); // tcs out layout
+                       add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_in_layout); // tcs in layout
+                       if (ctx->shader_info->info.needs_multiview_view_index)
+                               add_user_sgpr_argument(&args, ctx->i32, &ctx->view_index);
+                       add_sgpr_argument(&args, ctx->i32, &ctx->oc_lds); // param oc lds
+                       add_sgpr_argument(&args, ctx->i32, &ctx->tess_factor_offset); // tess factor offset
+                       add_vgpr_argument(&args, ctx->i32, &ctx->tcs_patch_id); // patch id
+                       add_vgpr_argument(&args, ctx->i32, &ctx->tcs_rel_ids); // rel ids;
+               }
                 break;
         case MESA_SHADER_TESS_EVAL:
+               radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
                 add_user_sgpr_argument(&args, ctx->i32, &ctx->tcs_offchip_layout); // tcs offchip layout
                 if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index))
                         add_user_sgpr_argument(&args, ctx->i32, &ctx->view_index);
@@ -712,6 +838,8 @@ static void create_function(struct nir_to_llvm_context *ctx)
                 add_vgpr_argument(&args, ctx->i32, &ctx->tes_patch_id); // tes patch id
                 break;
         case MESA_SHADER_GEOMETRY:
+               radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
+               radv_define_vs_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &args);
                 add_user_sgpr_argument(&args, ctx->i32, &ctx->gsvs_ring_stride); // gsvs stride
                 add_user_sgpr_argument(&args, ctx->i32, &ctx->gsvs_num_entries); // gsvs num entires
                 if (ctx->shader_info->info.needs_multiview_view_index)
@@ -728,6 +856,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
                 add_vgpr_argument(&args, ctx->i32, &ctx->gs_invocation_id);
                 break;
         case MESA_SHADER_FRAGMENT:
+               radv_define_common_user_sgprs_phase1(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, &args, &desc_sets);
                 if (ctx->shader_info->info.ps.needs_sample_positions)
                         add_user_sgpr_argument(&args, ctx->i32, &ctx->sample_pos_offset); /* sample position offset */
                 add_sgpr_argument(&args, ctx->i32, &ctx->prim_mask); /* prim mask */
@@ -756,14 +885,12 @@ static void create_function(struct nir_to_llvm_context *ctx)
             ctx->context, ctx->module, ctx->builder, NULL, 0, &args,
             ctx->max_workgroup_size,
             ctx->options->unsafe_math);
-       set_llvm_calling_convention(ctx->main_function, ctx->stage);
+       set_llvm_calling_convention(ctx->main_function, stage);
  
  
         ctx->shader_info->num_input_vgprs = 0;
-       ctx->shader_info->num_input_sgprs = ctx->shader_info->num_user_sgprs =
-         ctx->options->supports_spill ? 2 : 0;
+       ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0;
  
-       ctx->shader_info->num_user_sgprs += args.num_user_sgprs_used;
         ctx->shader_info->num_input_sgprs += args.num_sgprs_used;
  
         if (ctx->stage != MESA_SHADER_FRAGMENT)
@@ -783,50 +910,22 @@ static void create_function(struct nir_to_llvm_context *ctx)
                                                              const_array(ctx->v4i32, 16), "");
                 }
         }
+       
+       /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
+        * the rw_buffers at s0/s1). With user SGPR0 = s8, let's restart the count from 0. */
+       if (has_previous_stage)
+               user_sgpr_idx = 0;
  
-       if (!user_sgpr_info.indirect_all_descriptor_sets) {
-               for (unsigned i = 0; i < num_sets; ++i) {
-                       if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
-                               set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], &user_sgpr_idx, 2);
-                       } else
-                               ctx->descriptor_sets[i] = NULL;
-               }
-       } else {
-               uint32_t desc_sgpr_idx = user_sgpr_idx;
-               set_userdata_location_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_idx, 2);
-
-               for (unsigned i = 0; i < num_sets; ++i) {
-                       if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
-                               set_userdata_location_indirect(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], desc_sgpr_idx, 2, i * 8);
-                               ctx->descriptor_sets[i] = ac_build_indexed_load_const(&ctx->ac, desc_sets, LLVMConstInt(ctx->i32, i, false));
-
-                       } else
-                               ctx->descriptor_sets[i] = NULL;
-               }
-               ctx->shader_info->need_indirect_descriptor_sets = true;
-       }
-
-       if (ctx->shader_info->info.needs_push_constants) {
-               set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_idx, 2);
-       }
+       radv_define_common_user_sgprs_phase2(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_info, desc_sets, &user_sgpr_idx);
  
-       switch (ctx->stage) {
+       switch (stage) {
         case MESA_SHADER_COMPUTE:
                 if (ctx->shader_info->info.cs.grid_components_used) {
                         set_userdata_location_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, ctx->shader_info->info.cs.grid_components_used);
                 }
                 break;
         case MESA_SHADER_VERTEX:
-               if (!ctx->is_gs_copy_shader) {
-                       if (ctx->shader_info->info.vs.has_vertex_buffers) {
-                               set_userdata_location_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, &user_sgpr_idx, 2);
-                       }
-                       unsigned vs_num = 2;
-                       if (ctx->shader_info->info.vs.needs_draw_id)
-                               vs_num++;
-
-                       set_userdata_location_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_idx, vs_num);
-               }
+               radv_define_vs_user_sgprs_phase2(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
                 if (ctx->view_index)
                         set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
                 if (ctx->options->key.vs.as_ls) {
@@ -836,6 +935,9 @@ static void create_function(struct nir_to_llvm_context *ctx)
                         declare_tess_lds(ctx);
                 break;
         case MESA_SHADER_TESS_CTRL:
+               radv_define_vs_user_sgprs_phase2(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
+               if (has_previous_stage)
+                       set_userdata_location_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT, &user_sgpr_idx, 1);
                 set_userdata_location_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 4);
                 if (ctx->view_index)
                         set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
@@ -847,6 +949,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
                         set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
                 break;
         case MESA_SHADER_GEOMETRY:
+               radv_define_vs_user_sgprs_phase2(ctx, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
                 set_userdata_location_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES, &user_sgpr_idx, 2);
                 if (ctx->view_index)
                         set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
@@ -859,6 +962,8 @@ static void create_function(struct nir_to_llvm_context *ctx)
         default:
                 unreachable("Shader stage not implemented");
         }
+
+       ctx->shader_info->num_user_sgprs = user_sgpr_idx;
  }
  
  static void setup_types(struct nir_to_llvm_context *ctx)
@@ -1414,7 +1519,6 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
         unsigned mask;
         int idx;
         LLVMValueRef result;
-       bool has_ds_bpermute = ctx->abi->chip_class >= VI;
  
         if (op == nir_op_fddx_fine || op == nir_op_fddx)
                 mask = AC_TID_MASK_LEFT;
@@ -1431,9 +1535,7 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
         else
                 idx = 2;
  
-       result = ac_build_ddxy(&ctx->ac, has_ds_bpermute,
-                             mask, idx,
-                             src0);
+       result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
         return result;
  }
  
@@ -1710,7 +1812,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                                                       result);
                 break;
         case nir_op_ffma:
-               result = emit_intrin_3f_param(&ctx->ac, "llvm.fma",
+               result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd",
                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
                 break;
         case nir_op_ibitfield_extract:
@@ -1936,7 +2038,7 @@ get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_ele
                                         LLVMConstInt(ctx->ac.i32, 2, false), "");
  
         /* VI only */
-       if (ctx->abi->chip_class == VI && in_elements) {
+       if (ctx->ac.chip_class == VI && in_elements) {
                 /* On VI, the descriptor contains the size in bytes,
                  * but TXQ must return the size in elements.
                  * The stride is always non-zero for resources using TXQ.
@@ -2142,7 +2244,7 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
                 break;
         }
  
-       if (instr->op == nir_texop_tg4 && ctx->abi->chip_class <= VI) {
+       if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= VI) {
                 enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
                 if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
                         return radv_lower_gather4_integer(&ctx->ac, args, instr);
@@ -2527,7 +2629,7 @@ lds_load(struct nir_to_llvm_context *ctx,
          LLVMValueRef dw_addr)
  {
         LLVMValueRef value;
-       value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
+       value = ac_build_load(&ctx->ac, ctx->lds, dw_addr);
         return value;
  }
  
@@ -3269,7 +3371,7 @@ static LLVMValueRef get_image_coords(struct ac_nir_context *ctx,
                              dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
         bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
                       dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
-       bool gfx9_1d = ctx->abi->chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
+       bool gfx9_1d = ctx->ac.chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
         count = image_type_to_components_count(dim, is_array);
  
         if (is_ms) {
@@ -3414,7 +3516,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
         LLVMValueRef i1false = LLVMConstInt(ctx->ac.i1, 0, false);
         LLVMValueRef i1true = LLVMConstInt(ctx->ac.i1, 1, false);
         LLVMValueRef glc = i1false;
-       bool force_glc = ctx->abi->chip_class == SI;
+       bool force_glc = ctx->ac.chip_class == SI;
         if (force_glc)
                 glc = i1true;
  
@@ -3466,7 +3568,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
  static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                                         const nir_intrinsic_instr *instr)
  {
-       LLVMValueRef params[6];
+       LLVMValueRef params[7];
         int param_count = 0;
         const nir_variable *var = instr->variables[0]->var;
  
@@ -3579,7 +3681,8 @@ static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
                 z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
                 res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
         }
-       if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
+       if (ctx->ac.chip_class >= GFX9 &&
+           glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
             glsl_sampler_type_is_array(type)) {
                 LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
                 res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
@@ -3741,7 +3844,7 @@ static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx,
                                const_array(ctx->v2f32, 64), "");
  
         sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
-       result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false);
+       result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
  
         return result;
  }
@@ -4232,7 +4335,7 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
         list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->i32, offset, 0));
         list = LLVMBuildPointerCast(builder, list, const_array(type, 0), "");
  
-       return ac_build_indexed_load_const(&ctx->ac, list, index);
+       return ac_build_load_to_sgpr(&ctx->ac, list, index);
  }
  
  static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
@@ -4329,7 +4432,7 @@ static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx,
         LLVMBuilderRef builder = ctx->ac.builder;
         LLVMValueRef img7, samp0;
  
-       if (ctx->abi->chip_class >= VI)
+       if (ctx->ac.chip_class >= VI)
                 return samp;
  
         img7 = LLVMBuildExtractElement(builder, res,
@@ -4504,7 +4607,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                  * It's unnecessary if the original texture format was
                  * Z32_FLOAT, but we don't know that here.
                  */
-               if (ctx->abi->chip_class == VI)
+               if (ctx->ac.chip_class == VI)
                         z = ac_build_clamp(&ctx->ac, z);
  
                 address[count++] = z;
@@ -4528,7 +4631,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                         break;
                 case GLSL_SAMPLER_DIM_1D:
                         num_src_deriv_channels = 1;
-                       if (ctx->abi->chip_class >= GFX9) {
+                       if (ctx->ac.chip_class >= GFX9) {
                                 num_dest_deriv_channels = 2;
                                 num_deriv_comp = 2;
                         } else {
@@ -4549,15 +4652,13 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
         }
  
         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && coord) {
-               if (instr->is_array && instr->op != nir_texop_lod)
-                       coords[3] = apply_round_slice(&ctx->ac, coords[3]);
                 for (chan = 0; chan < instr->coord_components; chan++)
                         coords[chan] = ac_to_float(&ctx->ac, coords[chan]);
                 if (instr->coord_components == 3)
                         coords[3] = LLVMGetUndef(ctx->ac.f32);
                 ac_prepare_cube_coords(&ctx->ac,
                         instr->op == nir_texop_txd, instr->is_array,
-                       coords, derivs);
+                       instr->op == nir_texop_lod, coords, derivs);
                 if (num_deriv_comp)
                         num_deriv_comp--;
         }
@@ -4586,7 +4687,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                         address[count++] = coords[2];
                 }
  
-               if (ctx->abi->chip_class >= GFX9) {
+               if (ctx->ac.chip_class >= GFX9) {
                         LLVMValueRef filler;
                         if (instr->op == nir_texop_txf)
                                 filler = ctx->ac.i32_0;
@@ -4698,7 +4799,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                 LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
                 z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
                 result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, "");
-       } else if (ctx->abi->chip_class >= GFX9 &&
+       } else if (ctx->ac.chip_class >= GFX9 &&
                    instr->op == nir_texop_txs &&
                    instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
                    instr->is_array) {
@@ -4931,7 +5032,7 @@ handle_vs_input_decl(struct nir_to_llvm_context *ctx,
         for (unsigned i = 0; i < attrib_count; ++i, ++idx) {
                 t_offset = LLVMConstInt(ctx->i32, index + i, false);
  
-               t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
+               t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
  
                 input = ac_build_buffer_load_format(&ctx->ac, t_list,
                                                     buffer_index,
@@ -5152,7 +5253,9 @@ static LLVMValueRef si_build_alloca_undef(struct ac_llvm_context *ac,
  
  static void
  scan_shader_output_decl(struct nir_to_llvm_context *ctx,
-                       struct nir_variable *variable)
+                       struct nir_variable *variable,
+                       struct nir_shader *shader,
+                       gl_shader_stage stage)
  {
         int idx = variable->data.location + variable->data.index;
         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
@@ -5161,22 +5264,23 @@ scan_shader_output_decl(struct nir_to_llvm_context *ctx,
         variable->data.driver_location = idx * 4;
  
         /* tess ctrl has its own load/store paths for outputs */
-       if (ctx->stage == MESA_SHADER_TESS_CTRL)
+       if (stage == MESA_SHADER_TESS_CTRL)
                 return;
  
         mask_attribs = ((1ull << attrib_count) - 1) << idx;
-       if (ctx->stage == MESA_SHADER_VERTEX ||
-           ctx->stage == MESA_SHADER_TESS_EVAL ||
-           ctx->stage == MESA_SHADER_GEOMETRY) {
+       if (stage == MESA_SHADER_VERTEX ||
+           stage == MESA_SHADER_TESS_EVAL ||
+           stage == MESA_SHADER_GEOMETRY) {
                 if (idx == VARYING_SLOT_CLIP_DIST0) {
-                       int length = ctx->num_output_clips + ctx->num_output_culls;
-                       if (ctx->stage == MESA_SHADER_VERTEX) {
-                               ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << ctx->num_output_clips) - 1;
-                               ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << ctx->num_output_culls) - 1;
+                       int length = shader->info.clip_distance_array_size +
+                                    shader->info.cull_distance_array_size;
+                       if (stage == MESA_SHADER_VERTEX) {
+                               ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
+                               ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
                         }
-                       if (ctx->stage == MESA_SHADER_TESS_EVAL) {
-                               ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << ctx->num_output_clips) - 1;
-                               ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << ctx->num_output_culls) - 1;
+                       if (stage == MESA_SHADER_TESS_EVAL) {
+                               ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
+                               ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
                         }
  
                         if (length > 4)
@@ -6221,16 +6325,16 @@ ac_setup_rings(struct nir_to_llvm_context *ctx)
  {
         if ((ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) ||
             (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->options->key.tes.as_es)) {
-               ctx->esgs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_ESGS_VS, false));
+               ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_ESGS_VS, false));
         }
  
         if (ctx->is_gs_copy_shader) {
-               ctx->gsvs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_GSVS_VS, false));
+               ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_GSVS_VS, false));
         }
         if (ctx->stage == MESA_SHADER_GEOMETRY) {
                 LLVMValueRef tmp;
-               ctx->esgs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_ESGS_GS, false));
-               ctx->gsvs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_GSVS_GS, false));
+               ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_ESGS_GS, false));
+               ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_GSVS_GS, false));
  
                 ctx->gsvs_ring = LLVMBuildBitCast(ctx->builder, ctx->gsvs_ring, ctx->v4i32, "");
  
@@ -6242,8 +6346,8 @@ ac_setup_rings(struct nir_to_llvm_context *ctx)
  
         if (ctx->stage == MESA_SHADER_TESS_CTRL ||
             ctx->stage == MESA_SHADER_TESS_EVAL) {
-               ctx->hs_ring_tess_offchip = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_HS_TESS_OFFCHIP, false));
-               ctx->hs_ring_tess_factor = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_HS_TESS_FACTOR, false));
+               ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_HS_TESS_OFFCHIP, false));
+               ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_HS_TESS_FACTOR, false));
         }
  }
  
@@ -6330,7 +6434,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
         ctx.context = LLVMContextCreate();
         ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
  
-       ac_llvm_context_init(&ctx.ac, ctx.context);
+       ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class);
         ctx.ac.module = ctx.module;
  
         memset(shader_info, 0, sizeof(*shader_info));
@@ -6357,7 +6461,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
         for (i = 0; i < AC_UD_MAX_UD; i++)
                 shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
  
-       create_function(&ctx);
+       create_function(&ctx, nir->stage, false, MESA_SHADER_VERTEX);
  
         if (nir->stage == MESA_SHADER_GEOMETRY) {
                 ctx.gs_next_vertex = ac_build_alloca(&ctx.ac, ctx.i32, "gs_next_vertex");
@@ -6384,14 +6488,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
         else if(nir->stage == MESA_SHADER_VERTEX)
                 handle_vs_inputs(&ctx, nir);
  
-       ctx.abi.chip_class = options->chip_class;
         ctx.abi.inputs = &ctx.inputs[0];
         ctx.abi.emit_outputs = handle_shader_outputs_post;
         ctx.abi.load_ssbo = radv_load_ssbo;
         ctx.abi.load_sampler_desc = radv_get_sampler_desc;
  
         nir_foreach_variable(variable, &nir->outputs)
-               scan_shader_output_decl(&ctx, variable);
+               scan_shader_output_decl(&ctx, variable, nir, nir->stage);
  
         ac_nir_translate(&ctx.ac, &ctx.abi, nir, &ctx);
  
@@ -6542,53 +6645,61 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
                                  shader_info->num_input_sgprs + 3);
  }
  
+static void
+ac_fill_shader_info(struct ac_shader_variant_info *shader_info, struct nir_shader *nir, const struct ac_nir_compiler_options *options)
+{
+        switch (nir->stage) {
+        case MESA_SHADER_COMPUTE:
+                for (int i = 0; i < 3; ++i)
+                        shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
+                break;
+        case MESA_SHADER_FRAGMENT:
+                shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                shader_info->gs.vertices_in = nir->info.gs.vertices_in;
+                shader_info->gs.vertices_out = nir->info.gs.vertices_out;
+                shader_info->gs.output_prim = nir->info.gs.output_primitive;
+                shader_info->gs.invocations = nir->info.gs.invocations;
+                break;
+        case MESA_SHADER_TESS_EVAL:
+                shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
+                shader_info->tes.spacing = nir->info.tess.spacing;
+                shader_info->tes.ccw = nir->info.tess.ccw;
+                shader_info->tes.point_mode = nir->info.tess.point_mode;
+                shader_info->tes.as_es = options->key.tes.as_es;
+                break;
+        case MESA_SHADER_TESS_CTRL:
+                shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
+                break;
+        case MESA_SHADER_VERTEX:
+                shader_info->vs.as_es = options->key.vs.as_es;
+                shader_info->vs.as_ls = options->key.vs.as_ls;
+                /* in LS mode we need at least 1, invocation id needs 3, handled elsewhere */
+                if (options->key.vs.as_ls)
+                        shader_info->vs.vgpr_comp_cnt = MAX2(1, shader_info->vs.vgpr_comp_cnt);
+                break;
+        default:
+                break;
+        }
+}
+
  void ac_compile_nir_shader(LLVMTargetMachineRef tm,
                             struct ac_shader_binary *binary,
                             struct ac_shader_config *config,
                             struct ac_shader_variant_info *shader_info,
-                           struct nir_shader *nir,
+                           struct nir_shader *const *nir,
+                           int nir_count,
                             const struct ac_nir_compiler_options *options,
                            bool dump_shader)
  {
  
-       LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
+       LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir[0], shader_info,
                                                              options);
  
-       ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader, options->supports_spill);
-       switch (nir->stage) {
-       case MESA_SHADER_COMPUTE:
-               for (int i = 0; i < 3; ++i)
-                       shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
-               break;
-       case MESA_SHADER_FRAGMENT:
-               shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
-               break;
-       case MESA_SHADER_GEOMETRY:
-               shader_info->gs.vertices_in = nir->info.gs.vertices_in;
-               shader_info->gs.vertices_out = nir->info.gs.vertices_out;
-               shader_info->gs.output_prim = nir->info.gs.output_primitive;
-               shader_info->gs.invocations = nir->info.gs.invocations;
-               break;
-       case MESA_SHADER_TESS_EVAL:
-               shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
-               shader_info->tes.spacing = nir->info.tess.spacing;
-               shader_info->tes.ccw = nir->info.tess.ccw;
-               shader_info->tes.point_mode = nir->info.tess.point_mode;
-               shader_info->tes.as_es = options->key.tes.as_es;
-               break;
-       case MESA_SHADER_TESS_CTRL:
-               shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
-               break;
-       case MESA_SHADER_VERTEX:
-               shader_info->vs.as_es = options->key.vs.as_es;
-               shader_info->vs.as_ls = options->key.vs.as_ls;
-               /* in LS mode we need at least 1, invocation id needs 3, handled elsewhere */
-               if (options->key.vs.as_ls)
-                       shader_info->vs.vgpr_comp_cnt = MAX2(1, shader_info->vs.vgpr_comp_cnt);
-               break;
-       default:
-               break;
-       }
+       ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir[0]->stage, dump_shader, options->supports_spill);
+       for (int i = 0; i < nir_count; ++i)
+               ac_fill_shader_info(shader_info, nir[i], options);
  }
  
  static void
@@ -6654,7 +6765,7 @@ void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
         ctx.options = options;
         ctx.shader_info = shader_info;
  
-       ac_llvm_context_init(&ctx.ac, ctx.context);
+       ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class);
         ctx.ac.module = ctx.module;
  
         ctx.is_gs_copy_shader = true;
@@ -6665,7 +6776,7 @@ void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
         ctx.ac.builder = ctx.builder;
         ctx.stage = MESA_SHADER_VERTEX;
  
-       create_function(&ctx);
+       create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);
  
         ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
         ac_setup_rings(&ctx);
@@ -6681,7 +6792,7 @@ void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
         ctx.nir = &nir_ctx;
  
         nir_foreach_variable(variable, &geom_shader->outputs) {
-               scan_shader_output_decl(&ctx, variable);
+               scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
                 handle_shader_output_decl(&nir_ctx, geom_shader, variable);
         }