radeonsi: rename r600_resource -> si_resource
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 66fe5fad2185a52f1571bef593f2d6144365d588..a2ed899b58f4a51652f1bf586a6c0c02195cd230 100644 (file)
@@ -86,6 +86,8 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
+static void si_fix_resource_usage(struct si_screen *sscreen,
+                                 struct si_shader *shader);
 
 /* Ideally pass the sample mask input to the PS epilog as v14, which
  * is its usual location, so that the shader doesn't have to add v_mov.
@@ -428,20 +430,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
        }
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-       struct si_shader_context *ctx,
-       unsigned param_start_instance, LLVMValueRef divisor)
-{
-       LLVMValueRef result = ctx->abi.instance_id;
-
-       /* The division must be done before START_INSTANCE is added. */
-       if (divisor != ctx->i32_1)
-               result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-       return LLVMBuildAdd(ctx->ac.builder, result,
-                           LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
@@ -575,6 +563,14 @@ void si_llvm_load_input_vs(
 
        /* Do multiple loads for special formats. */
        switch (fix_fetch) {
+       case SI_FIX_FETCH_RG_64_FLOAT:
+               num_fetches = 1; /* 1 2-dword or 4-dword load */
+               fetch_stride = 0;
+               if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
+                       num_channels = 4; /* 2 doubles in 4 dwords */
+               else
+                       num_channels = 2; /* 1 double in 2 dwords */
+               break;
        case SI_FIX_FETCH_RGB_64_FLOAT:
                num_fetches = 3; /* 3 2-dword loads */
                fetch_stride = 8;
@@ -1204,11 +1200,11 @@ static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
 static LLVMValueRef fetch_input_tcs(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
-       enum tgsi_opcode_type type, unsigned swizzle)
+       enum tgsi_opcode_type type, unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef dw_addr, stride;
-
+       unsigned swizzle = swizzle_in & 0xffff;
        stride = get_tcs_in_vertex_dw_stride(ctx);
        dw_addr = get_tcs_in_current_patch_offset(ctx);
        dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
@@ -1289,10 +1285,11 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
 static LLVMValueRef fetch_output_tcs(
                struct lp_build_tgsi_context *bld_base,
                const struct tgsi_full_src_register *reg,
-               enum tgsi_opcode_type type, unsigned swizzle)
+               enum tgsi_opcode_type type, unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef dw_addr, stride;
+       unsigned swizzle = (swizzle_in & 0xffff);
 
        if (reg->Register.Dimension) {
                stride = get_tcs_out_vertex_dw_stride(ctx);
@@ -1309,10 +1306,11 @@ static LLVMValueRef fetch_output_tcs(
 static LLVMValueRef fetch_input_tes(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
-       enum tgsi_opcode_type type, unsigned swizzle)
+       enum tgsi_opcode_type type, unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef base, addr;
+       unsigned swizzle = (swizzle_in & 0xffff);
 
        base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
        addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
@@ -1696,10 +1694,11 @@ static LLVMValueRef fetch_input_gs(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
        enum tgsi_opcode_type type,
-       unsigned swizzle)
+       unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
+       unsigned swizzle = swizzle_in & 0xffff;
 
        unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
        if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
@@ -2267,6 +2266,10 @@ void si_load_system_value(struct si_shader_context *ctx,
                break;
        }
 
+       case TGSI_SEMANTIC_CS_USER_DATA:
+               value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data);
+               break;
+
        default:
                assert(!"unknown system value");
                return;
@@ -2280,7 +2283,7 @@ void si_declare_compute_memory(struct si_shader_context *ctx)
        struct si_shader_selector *sel = ctx->shader->selector;
        unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
 
-       LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_LOCAL_ADDR_SPACE);
+       LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
        LLVMValueRef var;
 
        assert(!ctx->ac.lds);
@@ -2288,7 +2291,7 @@ void si_declare_compute_memory(struct si_shader_context *ctx)
        var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
                                          LLVMArrayType(ctx->i8, lds_size),
                                          "compute_lds",
-                                         AC_LOCAL_ADDR_SPACE);
+                                         AC_ADDR_SPACE_LDS);
        LLVMSetAlignment(var, 4);
 
        ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
@@ -2317,18 +2320,9 @@ static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *c
        ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
 
        LLVMValueRef desc0, desc1;
-       if (HAVE_32BIT_POINTERS) {
-               desc0 = ptr;
-               desc1 = LLVMConstInt(ctx->i32,
-                                    S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
-       } else {
-               ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
-               desc0 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, "");
-               desc1 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, "");
-               /* Mask out all bits except BASE_ADDRESS_HI. */
-               desc1 = LLVMBuildAnd(ctx->ac.builder, desc1,
-                                    LLVMConstInt(ctx->i32, ~C_008F04_BASE_ADDRESS_HI, 0), "");
-       }
+       desc0 = ptr;
+       desc1 = LLVMConstInt(ctx->i32,
+                            S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
 
        LLVMValueRef desc_elems[] = {
                desc0,
@@ -2393,16 +2387,17 @@ static LLVMValueRef fetch_constant(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
        enum tgsi_opcode_type type,
-       unsigned swizzle)
+       unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct si_shader_selector *sel = ctx->shader->selector;
        const struct tgsi_ind_register *ireg = &reg->Indirect;
        unsigned buf, idx;
+       unsigned swizzle = swizzle_in & 0xffff;
 
        LLVMValueRef addr, bufp;
 
-       if (swizzle == LP_CHAN_ALL) {
+       if (swizzle_in == LP_CHAN_ALL) {
                unsigned chan;
                LLVMValueRef values[4];
                for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
@@ -2416,7 +2411,7 @@ static LLVMValueRef fetch_constant(
                LLVMValueRef lo, hi;
 
                lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
-               hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
+               hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16));
                return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
                                                lo, hi);
        }
@@ -2601,7 +2596,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
                        LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
                ac_build_kill_if_false(&ctx->ac, alpha_pass);
        } else {
-               ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0));
+               ac_build_kill_if_false(&ctx->ac, ctx->i1false);
        }
 }
 
@@ -2724,12 +2719,10 @@ static void emit_streamout_output(struct si_shader_context *ctx,
                break;
        case 2: /* as v2i32 */
        case 3: /* as v4i32 (aligned to 4) */
+               out[3] = LLVMGetUndef(ctx->i32);
+               /* fall through */
        case 4: /* as v4i32 */
-               vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
-               for (int j = 0; j < num_comps; j++) {
-                       vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j],
-                                                      LLVMConstInt(ctx->i32, j, 0), "");
-               }
+               vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
                break;
        }
 
@@ -3271,21 +3264,9 @@ si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                    unsigned param, unsigned return_index)
 {
        LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef ptr, lo, hi;
-
-       if (HAVE_32BIT_POINTERS) {
-               ptr = LLVMGetParam(ctx->main_fn, param);
-               ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
-               return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
-       }
-
-       ptr = LLVMGetParam(ctx->main_fn, param);
-       ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
-       ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
-       lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
-       hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
-       ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
-       return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
+       LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param);
+       ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
+       return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
 }
 
 /* This only writes the tessellation factor levels. */
@@ -3386,8 +3367,7 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
        LLVMValueRef ret = ctx->return_value;
 
        ret = si_insert_input_ptr(ctx, ret, 0, 0);
-       if (HAVE_32BIT_POINTERS)
-               ret = si_insert_input_ptr(ctx, ret, 1, 1);
+       ret = si_insert_input_ptr(ctx, ret, 1, 1);
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
@@ -3402,11 +3382,6 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
        ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
                                  8 + SI_SGPR_VS_STATE_BITS);
 
-#if !HAVE_32BIT_POINTERS
-       ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4,
-                                 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
-#endif
-
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
                                  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
@@ -3430,8 +3405,7 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
        LLVMValueRef ret = ctx->return_value;
 
        ret = si_insert_input_ptr(ctx, ret, 0, 0);
-       if (HAVE_32BIT_POINTERS)
-               ret = si_insert_input_ptr(ctx, ret, 1, 1);
+       ret = si_insert_input_ptr(ctx, ret, 1, 1);
        ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
@@ -3442,11 +3416,6 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
                                  ctx->param_bindless_samplers_and_images,
                                  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
 
-#if !HAVE_32BIT_POINTERS
-       ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4,
-                                 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
-#endif
-
        unsigned vgpr;
        if (ctx->type == PIPE_SHADER_VERTEX)
                vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
@@ -4320,9 +4289,12 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
        gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
        LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
 
-       /* Signal vertex emission */
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-                        si_get_gs_wave_id(ctx));
+       /* Signal vertex emission if vertex data was written. */
+       if (offset) {
+               ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+                                si_get_gs_wave_id(ctx));
+       }
+
        if (!use_kill)
                lp_build_endif(&if_state);
 }
@@ -4589,6 +4561,30 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
        }
 }
 
+static void declare_vs_blit_inputs(struct si_shader_context *ctx,
+                                  struct si_function_info *fninfo,
+                                  unsigned vs_blit_property)
+{
+       ctx->param_vs_blit_inputs = fninfo->num_params;
+       add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
+       add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
+       add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */
+
+       if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */
+       } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
+       }
+}
+
 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
                                    struct si_function_info *fninfo)
 {
@@ -4633,24 +4629,7 @@ static void create_function(struct si_shader_context *ctx)
                declare_global_desc_pointers(ctx, &fninfo);
 
                if (vs_blit_property) {
-                       ctx->param_vs_blit_inputs = fninfo.num_params;
-                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
-                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
-                       add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */
-
-                       if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */
-                       } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
-                       }
+                       declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
 
                        /* VGPRs */
                        declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
@@ -4707,13 +4686,8 @@ static void create_function(struct si_shader_context *ctx)
        case SI_SHADER_MERGED_VERTEX_TESSCTRL:
                /* Merged stages have 8 system SGPRs at the beginning. */
                /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
-               if (HAVE_32BIT_POINTERS) {
-                       declare_per_stage_desc_pointers(ctx, &fninfo,
-                                                       ctx->type == PIPE_SHADER_TESS_CTRL);
-               } else {
-                       declare_const_and_shader_buffers(ctx, &fninfo,
-                                                        ctx->type == PIPE_SHADER_TESS_CTRL);
-               }
+               declare_per_stage_desc_pointers(ctx, &fninfo,
+                                               ctx->type == PIPE_SHADER_TESS_CTRL);
                ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4726,15 +4700,9 @@ static void create_function(struct si_shader_context *ctx)
                                                ctx->type == PIPE_SHADER_VERTEX);
                declare_vs_specific_input_sgprs(ctx, &fninfo);
 
-               if (!HAVE_32BIT_POINTERS) {
-                       declare_samplers_and_images(ctx, &fninfo,
-                                                   ctx->type == PIPE_SHADER_TESS_CTRL);
-               }
                ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
-               if (!HAVE_32BIT_POINTERS) /* Align to 2 dwords. */
-                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
                ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
                        ac_array_in_const32_addr_space(ctx->v4i32));
 
@@ -4768,13 +4736,8 @@ static void create_function(struct si_shader_context *ctx)
        case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
                /* Merged stages have 8 system SGPRs at the beginning. */
                /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
-               if (HAVE_32BIT_POINTERS) {
-                       declare_per_stage_desc_pointers(ctx, &fninfo,
-                                                       ctx->type == PIPE_SHADER_GEOMETRY);
-               } else {
-                       declare_const_and_shader_buffers(ctx, &fninfo,
-                                                        ctx->type == PIPE_SHADER_GEOMETRY);
-               }
+               declare_per_stage_desc_pointers(ctx, &fninfo,
+                                               ctx->type == PIPE_SHADER_GEOMETRY);
                ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4793,14 +4756,8 @@ static void create_function(struct si_shader_context *ctx)
                        ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                        ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                        /* Declare as many input SGPRs as the VS has. */
-                       if (!HAVE_32BIT_POINTERS)
-                               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
                }
 
-               if (!HAVE_32BIT_POINTERS) {
-                       declare_samplers_and_images(ctx, &fninfo,
-                                                   ctx->type == PIPE_SHADER_GEOMETRY);
-               }
                if (ctx->type == PIPE_SHADER_VERTEX) {
                        ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
                                ac_array_in_const32_addr_space(ctx->v4i32));
@@ -4950,6 +4907,13 @@ static void create_function(struct si_shader_context *ctx)
                    shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
                        ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
 
+               unsigned cs_user_data_dwords =
+                       shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS];
+               if (cs_user_data_dwords) {
+                       ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR,
+                                                         LLVMVectorType(ctx->i32, cs_user_data_dwords));
+               }
+
                for (i = 0; i < 3; i++) {
                        ctx->abi.workgroup_ids[i] = NULL;
                        if (shader->selector->info.uses_block_id[i])
@@ -5278,7 +5242,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
               !mainb->rodata_size);
        assert(!epilog || !epilog->rodata_size);
 
-       r600_resource_reference(&shader->bo, NULL);
+       si_resource_reference(&shader->bo, NULL);
        shader->bo = si_aligned_buffer_create(&sscreen->b,
                                              sscreen->cpdma_prefetch_writes_memory ?
                                                0 : SI_RESOURCE_FLAG_READ_ONLY,
@@ -5291,7 +5255,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
        /* Upload. */
        ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
                                        PIPE_TRANSFER_READ_WRITE |
-                                       PIPE_TRANSFER_UNSYNCHRONIZED);
+                                       PIPE_TRANSFER_UNSYNCHRONIZED |
+                                       RADEON_TRANSFER_TEMPORARY);
 
        /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
         * endian-independent. */
@@ -5826,6 +5791,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
        if (r != 0) {
                FREE(shader);
                shader = NULL;
+       } else {
+               si_fix_resource_usage(sscreen, shader);
        }
        return shader;
 }
@@ -6141,7 +6108,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
        if (sel->force_correct_derivs_after_kill) {
                ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
                /* true = don't kill. */
-               LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0),
+               LLVMBuildStore(ctx->ac.builder, ctx->i1true,
                               ctx->postponed_kill);
        }
 
@@ -6653,7 +6620,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                        if (LLVMTypeOf(arg) != param_type) {
                                if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
                                        if (LLVMGetPointerAddressSpace(param_type) ==
-                                           AC_CONST_32BIT_ADDR_SPACE) {
+                                           AC_ADDR_SPACE_CONST_32BIT) {
                                                arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
                                                arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
                                        } else {
@@ -7154,20 +7121,9 @@ static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
        LLVMValueRef ptr[2], list;
        bool merged_shader = is_merged_shader(ctx);
 
-       if (HAVE_32BIT_POINTERS) {
-               ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
-               list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
-                                        ac_array_in_const32_addr_space(ctx->v4i32), "");
-               return list;
-       }
-
-       /* Get the pointer to rw buffers. */
        ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
-       ptr[1] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS + 1);
-       list = ac_build_gather_values(&ctx->ac, ptr, 2);
-       list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, "");
-       list = LLVMBuildIntToPtr(ctx->ac.builder, list,
-                                ac_array_in_const_addr_space(ctx->v4i32), "");
+       list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
+                                ac_array_in_const32_addr_space(ctx->v4i32), "");
        return list;
 }
 
@@ -7286,22 +7242,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        key->vs_prolog.states.instance_divisor_is_one & (1u << i);
                bool divisor_is_fetched =
                        key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-               LLVMValueRef index;
-
-               if (divisor_is_one || divisor_is_fetched) {
-                       LLVMValueRef divisor = ctx->i32_1;
-
-                       if (divisor_is_fetched) {
-                               divisor = buffer_load_const(ctx, instance_divisor_constbuf,
-                                                           LLVMConstInt(ctx->i32, i * 4, 0));
-                               divisor = ac_to_integer(&ctx->ac, divisor);
+               LLVMValueRef index = NULL;
+
+               if (divisor_is_one) {
+                       index = ctx->abi.instance_id;
+               } else if (divisor_is_fetched) {
+                       LLVMValueRef udiv_factors[4];
+
+                       for (unsigned j = 0; j < 4; j++) {
+                               udiv_factors[j] =
+                                       buffer_load_const(ctx, instance_divisor_constbuf,
+                                                         LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+                               udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
                        }
+                       /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+                        * Such InstanceID might not be achievable in a reasonable time though.
+                        */
+                       index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+                                                      udiv_factors[0], udiv_factors[1],
+                                                      udiv_factors[2], udiv_factors[3]);
+               }
 
-                       /* InstanceID / Divisor + StartInstance */
-                       index = get_instance_index_for_fetch(ctx,
-                                                            user_sgpr_base +
-                                                            SI_SGPR_START_INSTANCE,
-                                                            divisor);
+               if (divisor_is_one || divisor_is_fetched) {
+                       /* Add StartInstance. */
+                       index = LLVMBuildAdd(ctx->ac.builder, index,
+                                            LLVMGetParam(ctx->main_fn, user_sgpr_base +
+                                                         SI_SGPR_START_INSTANCE), "");
                } else {
                        /* VertexID + BaseVertex */
                        index = LLVMBuildAdd(ctx->ac.builder,
@@ -7385,8 +7351,6 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
-               if (!HAVE_32BIT_POINTERS)
-                       add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
                ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -8154,9 +8118,9 @@ int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compile
 void si_shader_destroy(struct si_shader *shader)
 {
        if (shader->scratch_bo)
-               r600_resource_reference(&shader->scratch_bo, NULL);
+               si_resource_reference(&shader->scratch_bo, NULL);
 
-       r600_resource_reference(&shader->bo, NULL);
+       si_resource_reference(&shader->bo, NULL);
 
        if (!shader->is_binary_shared)
                ac_shader_binary_clean(&shader->binary);