radeonsi: change the bit-packing of LS out/TCS in data

[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index 02001725fd87376062907412e825732e15413fa6..5c17c640a3c9462500594b3f040eea3f6c376a9b 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -72,6 +72,8 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
  static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
                                FILE *f);
  
+static unsigned llvm_get_type_size(LLVMTypeRef type);
+
  static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                         union si_shader_part_key *key);
  static void si_build_vs_epilog_function(struct si_shader_context *ctx,
@@ -225,9 +227,9 @@ static LLVMValueRef
  get_tcs_in_patch_stride(struct si_shader_context *ctx)
  {
         if (ctx->type == PIPE_SHADER_VERTEX)
-               return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
+               return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 8, 13);
         else if (ctx->type == PIPE_SHADER_TESS_CTRL)
-               return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
+               return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 8, 13);
         else {
                 assert(0);
                 return NULL;
@@ -303,7 +305,7 @@ static LLVMValueRef get_instance_index_for_fetch(
         struct si_shader_context *ctx,
         unsigned param_start_instance, unsigned divisor)
  {
-       struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
  
         LLVMValueRef result = LLVMGetParam(ctx->main_fn,
                                            ctx->param_instance_id);
@@ -338,8 +340,7 @@ static void declare_input_vs(
         const struct tgsi_full_declaration *decl,
         LLVMValueRef out[4])
  {
-       struct lp_build_context *base = &ctx->bld_base.base;
-       struct gallivm_state *gallivm = base->gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
  
         unsigned chan;
         unsigned fix_fetch;
@@ -574,7 +575,7 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
                                        const struct tgsi_ind_register *ind,
                                        int rel_index)
  {
-       struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef result;
  
         result = ctx->addrs[ind->Index][ind->Swizzle];
@@ -614,7 +615,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
                                    LLVMValueRef vertex_dw_stride,
                                    LLVMValueRef base_addr)
  {
-       struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct tgsi_shader_info *info = &ctx->shader->selector->info;
         ubyte *name, *index, *array_first;
         int first, param;
@@ -713,7 +714,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
                                                 LLVMValueRef vertex_index,
                                                 LLVMValueRef param_index)
  {
-       struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
         LLVMValueRef param_stride, constant16;
  
@@ -757,7 +758,7 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
                                         const struct tgsi_full_dst_register *dst,
                                         const struct tgsi_full_src_register *src)
  {
-       struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct tgsi_shader_info *info = &ctx->shader->selector->info;
         ubyte *name, *index, *array_first;
         struct tgsi_full_src_register reg;
@@ -821,7 +822,7 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
                                  LLVMValueRef base, bool readonly_memory)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef value, value2;
         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
@@ -863,7 +864,7 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
                              LLVMValueRef dw_addr)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef value;
  
         if (swizzle == ~0) {
@@ -872,7 +873,7 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
  
-               return lp_build_gather_values(bld_base->base.gallivm, values,
+               return lp_build_gather_values(gallivm, values,
                                               TGSI_NUM_CHANNELS);
         }
  
@@ -904,7 +905,7 @@ static void lds_store(struct lp_build_tgsi_context *bld_base,
                       LLVMValueRef value)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
  
         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
                             LLVMConstInt(ctx->i32, swizzle, 0));
@@ -922,7 +923,7 @@ static LLVMValueRef fetch_input_tcs(
         struct si_shader_context *ctx = si_shader_context(bld_base);
         LLVMValueRef dw_addr, stride;
  
-       stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
+       stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 24, 8);
         dw_addr = get_tcs_in_current_patch_offset(ctx);
         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
  
@@ -974,7 +975,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
                              LLVMValueRef dst[4])
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
         unsigned chan_index;
@@ -1045,7 +1046,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
         }
  
         if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
-               LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
+               LLVMValueRef value = lp_build_gather_values(gallivm,
                                                             values, 4);
                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
                                             base, 0, 1, 0, true, false);
@@ -1058,11 +1059,10 @@ static LLVMValueRef fetch_input_gs(
         enum tgsi_opcode_type type,
         unsigned swizzle)
  {
-       struct lp_build_context *base = &bld_base->base;
         struct si_shader_context *ctx = si_shader_context(bld_base);
         struct si_shader *shader = ctx->shader;
         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
-       struct gallivm_state *gallivm = base->gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef vtx_offset, soffset;
         unsigned vtx_offset_param;
         struct tgsi_shader_info *info = &shader->selector->info;
@@ -1083,7 +1083,7 @@ static LLVMValueRef fetch_input_gs(
                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
                 }
-               return lp_build_gather_values(bld_base->base.gallivm, values,
+               return lp_build_gather_values(gallivm, values,
                                               TGSI_NUM_CHANNELS);
         }
  
@@ -1174,9 +1174,7 @@ static void interp_fs_input(struct si_shader_context *ctx,
                             LLVMValueRef face,
                             LLVMValueRef result[4])
  {
-       struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-       struct lp_build_context *base = &bld_base->base;
-       struct gallivm_state *gallivm = base->gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef attr_number;
         LLVMValueRef i, j;
  
@@ -1380,6 +1378,8 @@ static void declare_system_value(struct si_shader_context *ctx,
         struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef value = 0;
  
+       assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
+
         switch (decl->Semantic.Name) {
         case TGSI_SEMANTIC_INSTANCEID:
                 value = LLVMGetParam(ctx->main_fn,
@@ -1395,8 +1395,9 @@ static void declare_system_value(struct si_shader_context *ctx,
                 break;
  
         case TGSI_SEMANTIC_VERTEXID_NOBASE:
-               value = LLVMGetParam(ctx->main_fn,
-                                    ctx->param_vertex_id);
+               /* Unused. Clarify the meaning in indexed vs. non-indexed
+                * draws if this is ever used again. */
+               assert(false);
                 break;
  
         case TGSI_SEMANTIC_BASEVERTEX:
@@ -1588,6 +1589,46 @@ static void declare_system_value(struct si_shader_context *ctx,
                 }
                 break;
  
+       case TGSI_SEMANTIC_SUBGROUP_SIZE:
+               value = LLVMConstInt(ctx->i32, 64, 0);
+               break;
+
+       case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
+               value = ac_get_thread_id(&ctx->ac);
+               break;
+
+       case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
+       {
+               LLVMValueRef id = ac_get_thread_id(&ctx->ac);
+               id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
+               value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
+               value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
+               break;
+       }
+
+       case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
+       case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
+       case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
+       case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
+       {
+               LLVMValueRef id = ac_get_thread_id(&ctx->ac);
+               if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
+                   decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
+                       /* All bits set except LSB */
+                       value = LLVMConstInt(ctx->i64, -2, 0);
+               } else {
+                       /* All bits set */
+                       value = LLVMConstInt(ctx->i64, -1, 0);
+               }
+               id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
+               value = LLVMBuildShl(gallivm->builder, value, id, "");
+               if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
+                   decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
+                       value = LLVMBuildNot(gallivm->builder, value, "");
+               value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
+               break;
+       }
+
         default:
                 assert(!"unknown system value");
                 return;
@@ -1647,7 +1688,7 @@ static LLVMValueRef fetch_constant(
                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
                         values[chan] = fetch_constant(bld_base, reg, type, chan);
  
-               return lp_build_gather_values(bld_base->base.gallivm, values, 4);
+               return lp_build_gather_values(&ctx->gallivm, values, 4);
         }
  
         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
@@ -1720,7 +1761,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
         struct lp_build_context *base = &bld_base->base;
-       LLVMBuilderRef builder = base->gallivm->builder;
+       LLVMBuilderRef builder = ctx->gallivm.builder;
         LLVMValueRef val[4];
         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
         unsigned chan;
@@ -1790,7 +1831,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
  
                         packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
                         args->out[chan] =
-                               LLVMBuildBitCast(base->gallivm->builder,
+                               LLVMBuildBitCast(ctx->gallivm.builder,
                                                  packed, ctx->f32, "");
                 }
                 break;
@@ -1929,7 +1970,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *
                                                   unsigned samplemask_param)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef coverage;
  
         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
@@ -2307,7 +2348,7 @@ handle_semantic:
                 if (shader->selector->info.writes_edgeflag) {
                         /* The output is a float, but the hw expects an integer
                          * with the first bit containing the edge flag. */
-                       edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
+                       edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
                                                          edgeflag_value,
                                                          ctx->i32, "");
                         edgeflag_value = lp_build_min(&bld_base->int_bld,
@@ -2315,7 +2356,7 @@ handle_semantic:
                                                       ctx->i32_1);
  
                         /* The LLVM intrinsic expects a float. */
-                       pos_args[1].out[1] = LLVMBuildBitCast(base->gallivm->builder,
+                       pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
                                                           edgeflag_value,
                                                           ctx->f32, "");
                 }
@@ -2354,7 +2395,7 @@ handle_semantic:
  static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
         uint64_t inputs;
@@ -2367,7 +2408,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
  
         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
  
-       lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
+       lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 24, 8);
         lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
                                          lds_vertex_stride, "");
         lds_base = get_tcs_in_current_patch_offset(ctx);
@@ -2400,7 +2441,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                   LLVMValueRef tcs_out_current_patch_data_offset)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct si_shader *shader = ctx->shader;
         unsigned tess_inner_index, tess_outer_index;
         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
@@ -2570,7 +2611,7 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
  
         /* Return epilog parameters from this function. */
-       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+       LLVMBuilderRef builder = ctx->gallivm.builder;
         LLVMValueRef ret = ctx->return_value;
         LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
         unsigned vgpr;
@@ -2617,12 +2658,12 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
         struct si_shader_context *ctx = si_shader_context(bld_base);
         struct si_shader *shader = ctx->shader;
         struct tgsi_shader_info *info = &shader->selector->info;
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         unsigned i, chan;
         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
                                               ctx->param_rel_auto_id);
         LLVMValueRef vertex_dw_stride =
-               unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
+               unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 24, 8);
         LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
                                                  vertex_dw_stride, "");
  
@@ -2646,7 +2687,7 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
  static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct si_shader *es = ctx->shader;
         struct tgsi_shader_info *info = &es->selector->info;
         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
@@ -2689,7 +2730,7 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
  static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct tgsi_shader_info *info = &ctx->shader->selector->info;
         struct si_shader_output_values *outputs = NULL;
         int i,j;
@@ -2822,7 +2863,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
                 if (stencil) {
                         /* Stencil should be in X[23:16]. */
                         stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
-                       stencil = LLVMBuildShl(base->gallivm->builder, stencil,
+                       stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
                                                LLVMConstInt(ctx->i32, 16, 0), "");
                         args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
                         mask |= 0x3;
@@ -2970,9 +3011,8 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
         struct si_shader *shader = ctx->shader;
-       struct lp_build_context *base = &bld_base->base;
         struct tgsi_shader_info *info = &shader->selector->info;
-       LLVMBuilderRef builder = base->gallivm->builder;
+       LLVMBuilderRef builder = ctx->gallivm.builder;
         unsigned i, j, first_vgpr, vgpr;
  
         LLVMValueRef color[8][4] = {};
@@ -3056,7 +3096,7 @@ static LLVMValueRef get_buffer_size(
         LLVMValueRef descriptor)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         LLVMValueRef size =
                 LLVMBuildExtractElement(builder, descriptor,
@@ -3088,16 +3128,43 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
  /* Prevent optimizations (at least of memory accesses) across the current
   * point in the program by emitting empty inline assembly that is marked as
   * having side effects.
+ *
+ * Optionally, a value can be passed through the inline assembly to prevent
+ * LLVM from hoisting calls to ReadNone functions.
   */
-#if 0 /* unused currently */
-static void emit_optimization_barrier(struct si_shader_context *ctx)
+static void emit_optimization_barrier(struct si_shader_context *ctx,
+                                     LLVMValueRef *pvgpr)
  {
+       static int counter = 0;
+
         LLVMBuilderRef builder = ctx->gallivm.builder;
-       LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
-       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
-       LLVMBuildCall(builder, inlineasm, NULL, 0, "");
+       char code[16];
+
+       snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
+
+       if (!pvgpr) {
+               LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
+               LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
+               LLVMBuildCall(builder, inlineasm, NULL, 0, "");
+       } else {
+               LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
+               LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
+               LLVMValueRef vgpr = *pvgpr;
+               LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
+               unsigned vgpr_size = llvm_get_type_size(vgpr_type);
+               LLVMValueRef vgpr0;
+
+               assert(vgpr_size % 4 == 0);
+
+               vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
+               vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
+               vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
+               vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
+               vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
+
+               *pvgpr = vgpr;
+       }
  }
-#endif
  
  /* Combine these with & instead of |. */
  #define NOOP_WAITCNT 0xf7f
@@ -3303,7 +3370,7 @@ static LLVMValueRef image_fetch_coords(
                 unsigned src)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         unsigned target = inst->Memory.Texture;
         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
@@ -3416,7 +3483,7 @@ static void load_fetch_args(
                 struct lp_build_emit_data * emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         const struct tgsi_full_instruction * inst = emit_data->inst;
         unsigned target = inst->Memory.Texture;
         LLVMValueRef rsrc;
@@ -3618,7 +3685,7 @@ static void load_emit(
                 struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         const struct tgsi_full_instruction * inst = emit_data->inst;
         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
@@ -3671,7 +3738,7 @@ static void store_fetch_args(
                 struct lp_build_emit_data * emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         const struct tgsi_full_instruction * inst = emit_data->inst;
         struct tgsi_full_src_register memory;
@@ -3835,7 +3902,7 @@ static void store_emit(
                 struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         const struct tgsi_full_instruction * inst = emit_data->inst;
         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
@@ -3888,7 +3955,7 @@ static void atomic_fetch_args(
                 struct lp_build_emit_data * emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         const struct tgsi_full_instruction * inst = emit_data->inst;
         LLVMValueRef data1, data2;
@@ -4016,7 +4083,7 @@ static void atomic_emit(
                 struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         const struct tgsi_full_instruction * inst = emit_data->inst;
         char intrinsic_name[40];
@@ -4153,7 +4220,7 @@ static void resq_emit(
                 struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         const struct tgsi_full_instruction *inst = emit_data->inst;
         LLVMValueRef out;
@@ -4354,7 +4421,7 @@ static void tex_fetch_args(
         struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         const struct tgsi_full_instruction *inst = emit_data->inst;
         unsigned opcode = inst->Instruction.Opcode;
         unsigned target = inst->Texture.Texture;
@@ -4880,7 +4947,7 @@ static void si_llvm_emit_txqs(
         struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         LLVMValueRef res, samples;
         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
@@ -4909,7 +4976,7 @@ static void si_llvm_emit_ddxy(
         struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         unsigned opcode = emit_data->info->opcode;
         LLVMValueRef val;
         int idx;
@@ -4941,7 +5008,7 @@ static LLVMValueRef si_llvm_emit_ddxy_interp(
         LLVMValueRef interp_ij)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef result[4], a;
         unsigned i;
  
@@ -4960,7 +5027,7 @@ static void interp_fetch_args(
         struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         const struct tgsi_full_instruction *inst = emit_data->inst;
  
         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
@@ -5005,7 +5072,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
         struct si_shader *shader = ctx->shader;
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMValueRef interp_param;
         const struct tgsi_full_instruction *inst = emit_data->inst;
         int input_index = inst->Src[0].Register.Index;
@@ -5070,7 +5137,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
  
                         ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
                 }
-               interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
+               interp_param = lp_build_gather_values(gallivm, ij_out, 2);
         }
  
         for (chan = 0; chan < 4; chan++) {
@@ -5108,8 +5175,13 @@ static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
         };
  
-       if (LLVMTypeOf(value) != ctx->i32)
-               args[0] = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
+       /* We currently have no other way to prevent LLVM from lifting the icmp
+        * calls to a dominating basic block.
+        */
+       emit_optimization_barrier(ctx, &args[0]);
+
+       if (LLVMTypeOf(args[0]) != ctx->i32)
+               args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
  
         return lp_build_intrinsic(gallivm->builder,
                                   "llvm.amdgcn.icmp.i32",
@@ -5176,6 +5248,61 @@ static void vote_eq_emit(
                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
  }
  
+static void ballot_emit(
+       const struct lp_build_tgsi_action *action,
+       struct lp_build_tgsi_context *bld_base,
+       struct lp_build_emit_data *emit_data)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       LLVMValueRef tmp;
+
+       tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
+       tmp = si_emit_ballot(ctx, tmp);
+       tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
+
+       emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
+       emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
+}
+
+static void read_invoc_fetch_args(
+       struct lp_build_tgsi_context *bld_base,
+       struct lp_build_emit_data *emit_data)
+{
+       emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                                0, emit_data->src_chan);
+
+       /* Always read the source invocation (= lane) from the X channel. */
+       emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                                1, TGSI_CHAN_X);
+       emit_data->arg_count = 2;
+}
+
+static void read_lane_emit(
+       const struct lp_build_tgsi_action *action,
+       struct lp_build_tgsi_context *bld_base,
+       struct lp_build_emit_data *emit_data)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+
+       /* We currently have no other way to prevent LLVM from lifting the icmp
+        * calls to a dominating basic block.
+        */
+       emit_optimization_barrier(ctx, &emit_data->args[0]);
+
+       for (unsigned i = 0; i < emit_data->arg_count; ++i) {
+               emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
+                                                     ctx->i32, "");
+       }
+
+       emit_data->output[emit_data->chan] =
+               ac_build_intrinsic(&ctx->ac, action->intr_name,
+                                  ctx->i32, emit_data->args, emit_data->arg_count,
+                                  AC_FUNC_ATTR_READNONE |
+                                  AC_FUNC_ATTR_CONVERGENT);
+}
+
  static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
                                        struct lp_build_emit_data *emit_data)
  {
@@ -5201,7 +5328,7 @@ static void si_llvm_emit_vertex(
         struct lp_build_context *uint = &bld_base->uint_bld;
         struct si_shader *shader = ctx->shader;
         struct tgsi_shader_info *info = &shader->selector->info;
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct lp_build_if_state if_state;
         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
                                             SI_PARAM_GS2VS_OFFSET);
@@ -5301,7 +5428,7 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
                                  struct lp_build_emit_data *emit_data)
  {
         struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
  
         /* SI only (thanks to a hw bug workaround):
          * The real barrier instruction isn’t needed, because an entire patch
@@ -5460,7 +5587,7 @@ static unsigned si_get_max_workgroup_size(struct si_shader *shader)
  static void create_function(struct si_shader_context *ctx)
  {
         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         struct si_shader *shader = ctx->shader;
         LLVMTypeRef params[SI_NUM_PARAMS + SI_MAX_ATTRIBS], v3i32;
         LLVMTypeRef returns[16+32*4];
@@ -5725,7 +5852,7 @@ static void create_function(struct si_shader_context *ctx)
   */
  static void preload_ring_buffers(struct si_shader_context *ctx)
  {
-       struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
  
         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
@@ -5827,8 +5954,7 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
                                          LLVMValueRef param_rw_buffers,
                                          unsigned param_pos_fixed_pt)
  {
-       struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-       struct gallivm_state *gallivm = bld_base->base.gallivm;
+       struct gallivm_state *gallivm = &ctx->gallivm;
         LLVMBuilderRef builder = gallivm->builder;
         LLVMValueRef slot, desc, offset, row, bit, address[2];
  
@@ -6443,14 +6569,14 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
         /* Dump LLVM IR before any optimization passes */
         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
             r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
-               ac_dump_module(bld_base->base.gallivm->module);
+               ac_dump_module(ctx.gallivm.module);
  
         si_llvm_finalize_module(&ctx,
                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
  
         r = si_compile_llvm(sscreen, &ctx.shader->binary,
                             &ctx.shader->config, ctx.tm,
-                           bld_base->base.gallivm->module,
+                           ctx.gallivm.module,
                             debug, PIPE_SHADER_GEOMETRY,
                             "GS Copy Shader");
         if (!r) {
@@ -6625,6 +6751,12 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
+       bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
+       bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
+       bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
+       bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
+       bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
+       bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
  
         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
@@ -6866,7 +6998,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                 int i;
                 for (i = 0; i < 4; i++) {
                         ctx->gs_next_vertex[i] =
-                               lp_build_alloca(bld_base->base.gallivm,
+                               lp_build_alloca(&ctx->gallivm,
                                                 ctx->i32, "");
                 }
         }
@@ -7346,7 +7478,6 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
  {
         struct si_shader_selector *sel = shader->selector;
         struct si_shader_context ctx;
-       struct lp_build_tgsi_context *bld_base;
         LLVMModuleRef mod;
         int r = -1;
  
@@ -7366,7 +7497,6 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
  
         shader->info.uses_instanceid = sel->info.uses_instanceid;
  
-       bld_base = &ctx.bld_base;
         ctx.load_system_value = declare_system_value;
  
         if (!si_compile_tgsi_main(&ctx, shader)) {
@@ -7459,7 +7589,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
         }
  
-       mod = bld_base->base.gallivm->module;
+       mod = ctx.gallivm.module;
  
         /* Dump LLVM IR before any optimization passes */
         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&