radeonsi: rename r600_resource -> si_resource
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 551671f40213a5319388d932bcfa998012640b30..a2ed899b58f4a51652f1bf586a6c0c02195cd230 100644 (file)
@@ -86,6 +86,8 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
+static void si_fix_resource_usage(struct si_screen *sscreen,
+                                 struct si_shader *shader);
 
 /* Ideally pass the sample mask input to the PS epilog as v14, which
  * is its usual location, so that the shader doesn't have to add v_mov.
@@ -101,15 +103,15 @@ static bool llvm_type_is_64bit(struct si_shader_context *ctx,
        return false;
 }
 
-static bool is_merged_shader(struct si_shader *shader)
+static bool is_merged_shader(struct si_shader_context *ctx)
 {
-       if (shader->selector->screen->info.chip_class <= VI)
+       if (ctx->screen->info.chip_class <= VI)
                return false;
 
-       return shader->key.as_ls ||
-              shader->key.as_es ||
-              shader->selector->type == PIPE_SHADER_TESS_CTRL ||
-              shader->selector->type == PIPE_SHADER_GEOMETRY;
+       return ctx->shader->key.as_ls ||
+              ctx->shader->key.as_es ||
+              ctx->type == PIPE_SHADER_TESS_CTRL ||
+              ctx->type == PIPE_SHADER_GEOMETRY;
 }
 
 static void si_init_function_info(struct si_function_info *fninfo)
@@ -378,10 +380,7 @@ get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
        LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
        LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return LLVMBuildAdd(ctx->ac.builder, patch0_offset,
-                           LLVMBuildMul(ctx->ac.builder, patch_stride,
-                                        rel_patch_id, ""),
-                           "");
+       return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
 }
 
 static LLVMValueRef
@@ -392,10 +391,7 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
        LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
        LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset,
-                           LLVMBuildMul(ctx->ac.builder, patch_stride,
-                                        rel_patch_id, ""),
-                           "");
+       return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
 }
 
 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
@@ -434,20 +430,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
        }
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-       struct si_shader_context *ctx,
-       unsigned param_start_instance, LLVMValueRef divisor)
-{
-       LLVMValueRef result = ctx->abi.instance_id;
-
-       /* The division must be done before START_INSTANCE is added. */
-       if (divisor != ctx->i32_1)
-               result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-       return LLVMBuildAdd(ctx->ac.builder, result,
-                           LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
@@ -581,6 +563,14 @@ void si_llvm_load_input_vs(
 
        /* Do multiple loads for special formats. */
        switch (fix_fetch) {
+       case SI_FIX_FETCH_RG_64_FLOAT:
+               num_fetches = 1; /* 1 2-dword or 4-dword load */
+               fetch_stride = 0;
+               if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
+                       num_channels = 4; /* 2 doubles in 4 dwords */
+               else
+                       num_channels = 2; /* 1 double in 2 dwords */
+               break;
        case SI_FIX_FETCH_RGB_64_FLOAT:
                num_fetches = 3; /* 3 2-dword loads */
                fetch_stride = 8;
@@ -815,12 +805,8 @@ LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
                result = ac_to_integer(&ctx->ac, result);
        }
 
-       if (addr_mul != 1)
-               result = LLVMBuildMul(ctx->ac.builder, result,
-                                     LLVMConstInt(ctx->i32, addr_mul, 0), "");
-       result = LLVMBuildAdd(ctx->ac.builder, result,
-                             LLVMConstInt(ctx->i32, rel_index, 0), "");
-       return result;
+       return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0),
+                            LLVMConstInt(ctx->i32, rel_index, 0));
 }
 
 /**
@@ -847,15 +833,13 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context
                                                        bool is_patch)
 {
        if (vertex_dw_stride) {
-               base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
-                                        LLVMBuildMul(ctx->ac.builder, vertex_index,
-                                                     vertex_dw_stride, ""), "");
+               base_addr = ac_build_imad(&ctx->ac, vertex_index,
+                                         vertex_dw_stride, base_addr);
        }
 
        if (param_index) {
-               base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
-                                        LLVMBuildMul(ctx->ac.builder, param_index,
-                                                     LLVMConstInt(ctx->i32, 4, 0), ""), "");
+               base_addr = ac_build_imad(&ctx->ac, param_index,
+                                         LLVMConstInt(ctx->i32, 4, 0), base_addr);
        }
 
        int param = is_patch ?
@@ -975,22 +959,15 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 
        constant16 = LLVMConstInt(ctx->i32, 16, 0);
        if (vertex_index) {
-               base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
-                                        vertices_per_patch, "");
-
-               base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
-                                        vertex_index, "");
-
+               base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
+                                         vertices_per_patch, vertex_index);
                param_stride = total_vertices;
        } else {
                base_addr = rel_patch_id;
                param_stride = num_patches;
        }
 
-       base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
-                                LLVMBuildMul(ctx->ac.builder, param_index,
-                                             param_stride, ""), "");
-
+       base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
        base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 
        if (!vertex_index) {
@@ -1223,11 +1200,11 @@ static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
 static LLVMValueRef fetch_input_tcs(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
-       enum tgsi_opcode_type type, unsigned swizzle)
+       enum tgsi_opcode_type type, unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef dw_addr, stride;
-
+       unsigned swizzle = swizzle_in & 0xffff;
        stride = get_tcs_in_vertex_dw_stride(ctx);
        dw_addr = get_tcs_in_current_patch_offset(ctx);
        dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
@@ -1308,10 +1285,11 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
 static LLVMValueRef fetch_output_tcs(
                struct lp_build_tgsi_context *bld_base,
                const struct tgsi_full_src_register *reg,
-               enum tgsi_opcode_type type, unsigned swizzle)
+               enum tgsi_opcode_type type, unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef dw_addr, stride;
+       unsigned swizzle = (swizzle_in & 0xffff);
 
        if (reg->Register.Dimension) {
                stride = get_tcs_out_vertex_dw_stride(ctx);
@@ -1328,10 +1306,11 @@ static LLVMValueRef fetch_output_tcs(
 static LLVMValueRef fetch_input_tes(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
-       enum tgsi_opcode_type type, unsigned swizzle)
+       enum tgsi_opcode_type type, unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef base, addr;
+       unsigned swizzle = (swizzle_in & 0xffff);
 
        base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
        addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
@@ -1715,10 +1694,11 @@ static LLVMValueRef fetch_input_gs(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
        enum tgsi_opcode_type type,
-       unsigned swizzle)
+       unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
+       unsigned swizzle = swizzle_in & 0xffff;
 
        unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
        if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
@@ -2286,6 +2266,10 @@ void si_load_system_value(struct si_shader_context *ctx,
                break;
        }
 
+       case TGSI_SEMANTIC_CS_USER_DATA:
+               value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data);
+               break;
+
        default:
                assert(!"unknown system value");
                return;
@@ -2299,7 +2283,7 @@ void si_declare_compute_memory(struct si_shader_context *ctx)
        struct si_shader_selector *sel = ctx->shader->selector;
        unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
 
-       LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_LOCAL_ADDR_SPACE);
+       LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
        LLVMValueRef var;
 
        assert(!ctx->ac.lds);
@@ -2307,7 +2291,7 @@ void si_declare_compute_memory(struct si_shader_context *ctx)
        var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
                                          LLVMArrayType(ctx->i8, lds_size),
                                          "compute_lds",
-                                         AC_LOCAL_ADDR_SPACE);
+                                         AC_ADDR_SPACE_LDS);
        LLVMSetAlignment(var, 4);
 
        ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
@@ -2336,18 +2320,9 @@ static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *c
        ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
 
        LLVMValueRef desc0, desc1;
-       if (HAVE_32BIT_POINTERS) {
-               desc0 = ptr;
-               desc1 = LLVMConstInt(ctx->i32,
-                                    S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
-       } else {
-               ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
-               desc0 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, "");
-               desc1 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, "");
-               /* Mask out all bits except BASE_ADDRESS_HI. */
-               desc1 = LLVMBuildAnd(ctx->ac.builder, desc1,
-                                    LLVMConstInt(ctx->i32, ~C_008F04_BASE_ADDRESS_HI, 0), "");
-       }
+       desc0 = ptr;
+       desc1 = LLVMConstInt(ctx->i32,
+                            S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
 
        LLVMValueRef desc_elems[] = {
                desc0,
@@ -2412,16 +2387,17 @@ static LLVMValueRef fetch_constant(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
        enum tgsi_opcode_type type,
-       unsigned swizzle)
+       unsigned swizzle_in)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct si_shader_selector *sel = ctx->shader->selector;
        const struct tgsi_ind_register *ireg = &reg->Indirect;
        unsigned buf, idx;
+       unsigned swizzle = swizzle_in & 0xffff;
 
        LLVMValueRef addr, bufp;
 
-       if (swizzle == LP_CHAN_ALL) {
+       if (swizzle_in == LP_CHAN_ALL) {
                unsigned chan;
                LLVMValueRef values[4];
                for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
@@ -2435,7 +2411,7 @@ static LLVMValueRef fetch_constant(
                LLVMValueRef lo, hi;
 
                lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
-               hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
+               hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16));
                return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
                                                lo, hi);
        }
@@ -2620,7 +2596,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
                        LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
                ac_build_kill_if_false(&ctx->ac, alpha_pass);
        } else {
-               ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0));
+               ac_build_kill_if_false(&ctx->ac, ctx->i1false);
        }
 }
 
@@ -2678,10 +2654,8 @@ static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
                                                                const_chan) * 4, 0);
                                base_elt = buffer_load_const(ctx, const_resource,
                                                             addr);
-                               args->out[chan] =
-                                       LLVMBuildFAdd(ctx->ac.builder, args->out[chan],
-                                                     LLVMBuildFMul(ctx->ac.builder, base_elt,
-                                                                   out_elts[const_chan], ""), "");
+                               args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
+                                                               out_elts[const_chan], args->out[chan]);
                        }
                }
 
@@ -2745,12 +2719,10 @@ static void emit_streamout_output(struct si_shader_context *ctx,
                break;
        case 2: /* as v2i32 */
        case 3: /* as v4i32 (aligned to 4) */
+               out[3] = LLVMGetUndef(ctx->i32);
+               /* fall through */
        case 4: /* as v4i32 */
-               vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
-               for (int j = 0; j < num_comps; j++) {
-                       vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j],
-                                                      LLVMConstInt(ctx->i32, j, 0), "");
-               }
+               vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
                break;
        }
 
@@ -2823,9 +2795,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx,
                                                              ctx->param_streamout_offset[i]);
                        so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
 
-                       so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
-                                                         LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
-                       so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
+                       so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
+                                                          LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
+                                                          so_offset);
                }
 
                /* Write streamout data. */
@@ -3058,7 +3030,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef invocation_id, buffer, buffer_offset;
-       LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
+       LLVMValueRef lds_vertex_stride, lds_base;
        uint64_t inputs;
 
        invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
@@ -3066,10 +3038,9 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
        buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 
        lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
-       lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id,
-                                        lds_vertex_stride, "");
        lds_base = get_tcs_in_current_patch_offset(ctx);
-       lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, "");
+       lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
+                                lds_base);
 
        inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
        while (inputs) {
@@ -3293,21 +3264,9 @@ si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                    unsigned param, unsigned return_index)
 {
        LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef ptr, lo, hi;
-
-       if (HAVE_32BIT_POINTERS) {
-               ptr = LLVMGetParam(ctx->main_fn, param);
-               ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
-               return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
-       }
-
-       ptr = LLVMGetParam(ctx->main_fn, param);
-       ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
-       ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
-       lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
-       hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
-       ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
-       return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
+       LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param);
+       ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
+       return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
 }
 
 /* This only writes the tessellation factor levels. */
@@ -3408,8 +3367,7 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
        LLVMValueRef ret = ctx->return_value;
 
        ret = si_insert_input_ptr(ctx, ret, 0, 0);
-       if (HAVE_32BIT_POINTERS)
-               ret = si_insert_input_ptr(ctx, ret, 1, 1);
+       ret = si_insert_input_ptr(ctx, ret, 1, 1);
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
@@ -3424,11 +3382,6 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
        ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
                                  8 + SI_SGPR_VS_STATE_BITS);
 
-#if !HAVE_32BIT_POINTERS
-       ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4,
-                                 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
-#endif
-
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
                                  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
        ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
@@ -3452,8 +3405,7 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
        LLVMValueRef ret = ctx->return_value;
 
        ret = si_insert_input_ptr(ctx, ret, 0, 0);
-       if (HAVE_32BIT_POINTERS)
-               ret = si_insert_input_ptr(ctx, ret, 1, 1);
+       ret = si_insert_input_ptr(ctx, ret, 1, 1);
        ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
        ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
@@ -3464,11 +3416,6 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
                                  ctx->param_bindless_samplers_and_images,
                                  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
 
-#if !HAVE_32BIT_POINTERS
-       ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4,
-                                 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
-#endif
-
        unsigned vgpr;
        if (ctx->type == PIPE_SHADER_VERTEX)
                vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
@@ -4134,17 +4081,12 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
                                                                      ddxy_out, iy_ll, "");
                        LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
                                                                         interp_param, ix_ll, "");
-                       LLVMValueRef temp1, temp2;
+                       LLVMValueRef temp;
 
                        interp_el = ac_to_float(&ctx->ac, interp_el);
 
-                       temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, offset_x, "");
-
-                       temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, "");
-
-                       temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, offset_y, "");
-
-                       ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, "");
+                       temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el);
+                       ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp);
                }
                interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
        }
@@ -4347,9 +4289,12 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
        gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
        LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
 
-       /* Signal vertex emission */
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-                        si_get_gs_wave_id(ctx));
+       /* Signal vertex emission if vertex data was written. */
+       if (offset) {
+               ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+                                si_get_gs_wave_id(ctx));
+       }
+
        if (!use_kill)
                lp_build_endif(&if_state);
 }
@@ -4404,9 +4349,7 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
                return;
        }
 
-       ac_build_intrinsic(&ctx->ac,
-                          "llvm.amdgcn.s.barrier",
-                          ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
+       ac_build_s_barrier(&ctx->ac);
 }
 
 static void si_create_function(struct si_shader_context *ctx,
@@ -4618,6 +4561,30 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
        }
 }
 
+static void declare_vs_blit_inputs(struct si_shader_context *ctx,
+                                  struct si_function_info *fninfo,
+                                  unsigned vs_blit_property)
+{
+       ctx->param_vs_blit_inputs = fninfo->num_params;
+       add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
+       add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
+       add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */
+
+       if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */
+       } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
+               add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
+       }
+}
+
 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
                                    struct si_function_info *fninfo)
 {
@@ -4662,24 +4629,7 @@ static void create_function(struct si_shader_context *ctx)
                declare_global_desc_pointers(ctx, &fninfo);
 
                if (vs_blit_property) {
-                       ctx->param_vs_blit_inputs = fninfo.num_params;
-                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
-                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
-                       add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */
-
-                       if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */
-                       } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
-                               add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
-                       }
+                       declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
 
                        /* VGPRs */
                        declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
@@ -4736,13 +4686,8 @@ static void create_function(struct si_shader_context *ctx)
        case SI_SHADER_MERGED_VERTEX_TESSCTRL:
                /* Merged stages have 8 system SGPRs at the beginning. */
                /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
-               if (HAVE_32BIT_POINTERS) {
-                       declare_per_stage_desc_pointers(ctx, &fninfo,
-                                                       ctx->type == PIPE_SHADER_TESS_CTRL);
-               } else {
-                       declare_const_and_shader_buffers(ctx, &fninfo,
-                                                        ctx->type == PIPE_SHADER_TESS_CTRL);
-               }
+               declare_per_stage_desc_pointers(ctx, &fninfo,
+                                               ctx->type == PIPE_SHADER_TESS_CTRL);
                ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4755,15 +4700,9 @@ static void create_function(struct si_shader_context *ctx)
                                                ctx->type == PIPE_SHADER_VERTEX);
                declare_vs_specific_input_sgprs(ctx, &fninfo);
 
-               if (!HAVE_32BIT_POINTERS) {
-                       declare_samplers_and_images(ctx, &fninfo,
-                                                   ctx->type == PIPE_SHADER_TESS_CTRL);
-               }
                ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
-               if (!HAVE_32BIT_POINTERS) /* Align to 2 dwords. */
-                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
                ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
                        ac_array_in_const32_addr_space(ctx->v4i32));
 
@@ -4797,13 +4736,8 @@ static void create_function(struct si_shader_context *ctx)
        case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
                /* Merged stages have 8 system SGPRs at the beginning. */
                /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
-               if (HAVE_32BIT_POINTERS) {
-                       declare_per_stage_desc_pointers(ctx, &fninfo,
-                                                       ctx->type == PIPE_SHADER_GEOMETRY);
-               } else {
-                       declare_const_and_shader_buffers(ctx, &fninfo,
-                                                        ctx->type == PIPE_SHADER_GEOMETRY);
-               }
+               declare_per_stage_desc_pointers(ctx, &fninfo,
+                                               ctx->type == PIPE_SHADER_GEOMETRY);
                ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4822,14 +4756,8 @@ static void create_function(struct si_shader_context *ctx)
                        ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                        ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                        /* Declare as many input SGPRs as the VS has. */
-                       if (!HAVE_32BIT_POINTERS)
-                               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
                }
 
-               if (!HAVE_32BIT_POINTERS) {
-                       declare_samplers_and_images(ctx, &fninfo,
-                                                   ctx->type == PIPE_SHADER_GEOMETRY);
-               }
                if (ctx->type == PIPE_SHADER_VERTEX) {
                        ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
                                ac_array_in_const32_addr_space(ctx->v4i32));
@@ -4979,6 +4907,13 @@ static void create_function(struct si_shader_context *ctx)
                    shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
                        ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
 
+               unsigned cs_user_data_dwords =
+                       shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS];
+               if (cs_user_data_dwords) {
+                       ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR,
+                                                         LLVMVectorType(ctx->i32, cs_user_data_dwords));
+               }
+
                for (i = 0; i < 3; i++) {
                        ctx->abi.workgroup_ids[i] = NULL;
                        if (shader->selector->info.uses_block_id[i])
@@ -5307,7 +5242,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
               !mainb->rodata_size);
        assert(!epilog || !epilog->rodata_size);
 
-       r600_resource_reference(&shader->bo, NULL);
+       si_resource_reference(&shader->bo, NULL);
        shader->bo = si_aligned_buffer_create(&sscreen->b,
                                              sscreen->cpdma_prefetch_writes_memory ?
                                                0 : SI_RESOURCE_FLAG_READ_ONLY,
@@ -5320,7 +5255,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
        /* Upload. */
        ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
                                        PIPE_TRANSFER_READ_WRITE |
-                                       PIPE_TRANSFER_UNSYNCHRONIZED);
+                                       PIPE_TRANSFER_UNSYNCHRONIZED |
+                                       RADEON_TRANSFER_TEMPORARY);
 
        /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
         * endian-independent. */
@@ -5855,6 +5791,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
        if (r != 0) {
                FREE(shader);
                shader = NULL;
+       } else {
+               si_fix_resource_usage(sscreen, shader);
        }
        return shader;
 }
@@ -6170,7 +6108,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
        if (sel->force_correct_derivs_after_kill) {
                ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
                /* true = don't kill. */
-               LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0),
+               LLVMBuildStore(ctx->ac.builder, ctx->i1true,
                               ctx->postponed_kill);
        }
 
@@ -6580,7 +6518,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
        si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
                           si_get_max_workgroup_size(ctx->shader));
 
-       if (is_merged_shader(ctx->shader))
+       if (is_merged_shader(ctx))
                ac_init_exec_full_mask(&ctx->ac);
 
        /* Record the arguments of the function as if they were an output of
@@ -6638,7 +6576,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
                /* Merged shaders are executed conditionally depending
                 * on the number of enabled threads passed in the input SGPRs. */
-               if (is_merged_shader(ctx->shader) && part == 0) {
+               if (is_merged_shader(ctx) && part == 0) {
                        LLVMValueRef ena, count = initial[3];
 
                        count = LLVMBuildAnd(builder, count,
@@ -6682,7 +6620,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                        if (LLVMTypeOf(arg) != param_type) {
                                if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
                                        if (LLVMGetPointerAddressSpace(param_type) ==
-                                           AC_CONST_32BIT_ADDR_SPACE) {
+                                           AC_ADDR_SPACE_CONST_32BIT) {
                                                arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
                                                arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
                                        } else {
@@ -6700,7 +6638,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
                ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
 
-               if (is_merged_shader(ctx->shader) &&
+               if (is_merged_shader(ctx) &&
                    part + 1 == next_shader_first_part) {
                        lp_build_endif(&if_state);
 
@@ -6994,7 +6932,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 
        /* Compile to bytecode. */
        r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
-                           ctx.ac.module, debug, ctx.type, "TGSI shader",
+                           ctx.ac.module, debug, ctx.type,
+                           si_get_shader_name(shader, ctx.type),
                            si_should_optimize_less(compiler, shader->selector));
        si_llvm_dispose(&ctx);
        if (r) {
@@ -7034,7 +6973,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
        }
 
        /* Add the scratch offset to input SGPRs. */
-       if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
+       if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx))
                shader->info.num_input_sgprs += 1; /* scratch byte offset */
 
        /* Calculate the number of fragment input VGPRs. */
@@ -7180,26 +7119,11 @@ out:
 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
 {
        LLVMValueRef ptr[2], list;
-       bool is_merged_shader =
-               ctx->screen->info.chip_class >= GFX9 &&
-               (ctx->type == PIPE_SHADER_TESS_CTRL ||
-                ctx->type == PIPE_SHADER_GEOMETRY ||
-                ctx->shader->key.as_ls || ctx->shader->key.as_es);
-
-       if (HAVE_32BIT_POINTERS) {
-               ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
-               list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
-                                        ac_array_in_const32_addr_space(ctx->v4i32), "");
-               return list;
-       }
-
-       /* Get the pointer to rw buffers. */
-       ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
-       ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS + 1);
-       list = ac_build_gather_values(&ctx->ac, ptr, 2);
-       list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, "");
-       list = LLVMBuildIntToPtr(ctx->ac.builder, list,
-                                ac_array_in_const_addr_space(ctx->v4i32), "");
+       bool merged_shader = is_merged_shader(ctx);
+
+       ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
+       list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
+                                ac_array_in_const32_addr_space(ctx->v4i32), "");
        return list;
 }
 
@@ -7318,22 +7242,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        key->vs_prolog.states.instance_divisor_is_one & (1u << i);
                bool divisor_is_fetched =
                        key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-               LLVMValueRef index;
-
-               if (divisor_is_one || divisor_is_fetched) {
-                       LLVMValueRef divisor = ctx->i32_1;
-
-                       if (divisor_is_fetched) {
-                               divisor = buffer_load_const(ctx, instance_divisor_constbuf,
-                                                           LLVMConstInt(ctx->i32, i * 4, 0));
-                               divisor = ac_to_integer(&ctx->ac, divisor);
+               LLVMValueRef index = NULL;
+
+               if (divisor_is_one) {
+                       index = ctx->abi.instance_id;
+               } else if (divisor_is_fetched) {
+                       LLVMValueRef udiv_factors[4];
+
+                       for (unsigned j = 0; j < 4; j++) {
+                               udiv_factors[j] =
+                                       buffer_load_const(ctx, instance_divisor_constbuf,
+                                                         LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+                               udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
                        }
+                       /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+                        * Such InstanceID might not be achievable in a reasonable time though.
+                        */
+                       index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+                                                      udiv_factors[0], udiv_factors[1],
+                                                      udiv_factors[2], udiv_factors[3]);
+               }
 
-                       /* InstanceID / Divisor + StartInstance */
-                       index = get_instance_index_for_fetch(ctx,
-                                                            user_sgpr_base +
-                                                            SI_SGPR_START_INSTANCE,
-                                                            divisor);
+               if (divisor_is_one || divisor_is_fetched) {
+                       /* Add StartInstance. */
+                       index = LLVMBuildAdd(ctx->ac.builder, index,
+                                            LLVMGetParam(ctx->main_fn, user_sgpr_base +
+                                                         SI_SGPR_START_INSTANCE), "");
                } else {
                        /* VertexID + BaseVertex */
                        index = LLVMBuildAdd(ctx->ac.builder,
@@ -7417,8 +7351,6 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
-               if (!HAVE_32BIT_POINTERS)
-                       add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
                ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                add_arg(&fninfo, ARG_SGPR, ctx->i32);
                ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -8186,9 +8118,9 @@ int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compile
 void si_shader_destroy(struct si_shader *shader)
 {
        if (shader->scratch_bo)
-               r600_resource_reference(&shader->scratch_bo, NULL);
+               si_resource_reference(&shader->scratch_bo, NULL);
 
-       r600_resource_reference(&shader->bo, NULL);
+       si_resource_reference(&shader->bo, NULL);
 
        if (!shader->is_binary_shared)
                ac_shader_binary_clean(&shader->binary);