X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.c;h=a2ed899b58f4a51652f1bf586a6c0c02195cd230;hb=501ff90a954f5a3b9fee1449ec96fbc9bd620f55;hp=551671f40213a5319388d932bcfa998012640b30;hpb=297fb213b3cbb355cdee4c7c0f6f4feab917bc87;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 551671f4021..a2ed899b58f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -86,6 +86,8 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key); static void si_build_ps_epilog_function(struct si_shader_context *ctx, union si_shader_part_key *key); +static void si_fix_resource_usage(struct si_screen *sscreen, + struct si_shader *shader); /* Ideally pass the sample mask input to the PS epilog as v14, which * is its usual location, so that the shader doesn't have to add v_mov. @@ -101,15 +103,15 @@ static bool llvm_type_is_64bit(struct si_shader_context *ctx, return false; } -static bool is_merged_shader(struct si_shader *shader) +static bool is_merged_shader(struct si_shader_context *ctx) { - if (shader->selector->screen->info.chip_class <= VI) + if (ctx->screen->info.chip_class <= VI) return false; - return shader->key.as_ls || - shader->key.as_es || - shader->selector->type == PIPE_SHADER_TESS_CTRL || - shader->selector->type == PIPE_SHADER_GEOMETRY; + return ctx->shader->key.as_ls || + ctx->shader->key.as_es || + ctx->type == PIPE_SHADER_TESS_CTRL || + ctx->type == PIPE_SHADER_GEOMETRY; } static void si_init_function_info(struct si_function_info *fninfo) @@ -378,10 +380,7 @@ get_tcs_out_current_patch_offset(struct si_shader_context *ctx) LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildAdd(ctx->ac.builder, patch0_offset, - LLVMBuildMul(ctx->ac.builder, patch_stride, - rel_patch_id, ""), - ""); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); } static LLVMValueRef @@ -392,10 +391,7 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset, - LLVMBuildMul(ctx->ac.builder, patch_stride, - rel_patch_id, ""), - ""); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); } static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) @@ -434,20 +430,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) } } -static LLVMValueRef get_instance_index_for_fetch( - struct si_shader_context *ctx, - unsigned param_start_instance, LLVMValueRef divisor) -{ - LLVMValueRef result = ctx->abi.instance_id; - - /* The division must be done before START_INSTANCE is added. */ - if (divisor != ctx->i32_1) - result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, ""); - - return LLVMBuildAdd(ctx->ac.builder, result, - LLVMGetParam(ctx->main_fn, param_start_instance), ""); -} - /* Bitcast <4 x float> to <2 x double>, extract the component, and convert * to float. */ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, @@ -581,6 +563,14 @@ void si_llvm_load_input_vs( /* Do multiple loads for special formats. */ switch (fix_fetch) { + case SI_FIX_FETCH_RG_64_FLOAT: + num_fetches = 1; /* 1 2-dword or 4-dword load */ + fetch_stride = 0; + if (util_last_bit(info->input_usage_mask[input_index]) >= 2) + num_channels = 4; /* 2 doubles in 4 dwords */ + else + num_channels = 2; /* 1 double in 2 dwords */ + break; case SI_FIX_FETCH_RGB_64_FLOAT: num_fetches = 3; /* 3 2-dword loads */ fetch_stride = 8; @@ -815,12 +805,8 @@ LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx, result = ac_to_integer(&ctx->ac, result); } - if (addr_mul != 1) - result = LLVMBuildMul(ctx->ac.builder, result, - LLVMConstInt(ctx->i32, addr_mul, 0), ""); - result = LLVMBuildAdd(ctx->ac.builder, result, - LLVMConstInt(ctx->i32, rel_index, 0), ""); - return result; + return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0), + LLVMConstInt(ctx->i32, rel_index, 0)); } /** @@ -847,15 +833,13 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context bool is_patch) { if (vertex_dw_stride) { - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - LLVMBuildMul(ctx->ac.builder, vertex_index, - vertex_dw_stride, ""), ""); + base_addr = ac_build_imad(&ctx->ac, vertex_index, + vertex_dw_stride, base_addr); } if (param_index) { - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - LLVMBuildMul(ctx->ac.builder, param_index, - LLVMConstInt(ctx->i32, 4, 0), ""), ""); + base_addr = ac_build_imad(&ctx->ac, param_index, + LLVMConstInt(ctx->i32, 4, 0), base_addr); } int param = is_patch ? @@ -975,22 +959,15 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, constant16 = LLVMConstInt(ctx->i32, 16, 0); if (vertex_index) { - base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id, - vertices_per_patch, ""); - - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - vertex_index, ""); - + base_addr = ac_build_imad(&ctx->ac, rel_patch_id, + vertices_per_patch, vertex_index); param_stride = total_vertices; } else { base_addr = rel_patch_id; param_stride = num_patches; } - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - LLVMBuildMul(ctx->ac.builder, param_index, - param_stride, ""), ""); - + base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); if (!vertex_index) { @@ -1223,11 +1200,11 @@ static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, static LLVMValueRef fetch_input_tcs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, unsigned swizzle) + enum tgsi_opcode_type type, unsigned swizzle_in) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef dw_addr, stride; - + unsigned swizzle = swizzle_in & 0xffff; stride = get_tcs_in_vertex_dw_stride(ctx); dw_addr = get_tcs_in_current_patch_offset(ctx); dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); @@ -1308,10 +1285,11 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, static LLVMValueRef fetch_output_tcs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, unsigned swizzle) + enum tgsi_opcode_type type, unsigned swizzle_in) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef dw_addr, stride; + unsigned swizzle = (swizzle_in & 0xffff); if (reg->Register.Dimension) { stride = get_tcs_out_vertex_dw_stride(ctx); @@ -1328,10 +1306,11 @@ static LLVMValueRef fetch_output_tcs( static LLVMValueRef fetch_input_tes( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, unsigned swizzle) + enum tgsi_opcode_type type, unsigned swizzle_in) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef base, addr; + unsigned swizzle = (swizzle_in & 0xffff); base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); @@ -1715,10 +1694,11 @@ static LLVMValueRef fetch_input_gs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, - unsigned swizzle) + unsigned swizzle_in) { struct si_shader_context *ctx = si_shader_context(bld_base); struct tgsi_shader_info *info = &ctx->shader->selector->info; + unsigned swizzle = swizzle_in & 0xffff; unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) @@ -2286,6 +2266,10 @@ void si_load_system_value(struct si_shader_context *ctx, break; } + case TGSI_SEMANTIC_CS_USER_DATA: + value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data); + break; + default: assert(!"unknown system value"); return; @@ -2299,7 +2283,7 @@ void si_declare_compute_memory(struct si_shader_context *ctx) struct si_shader_selector *sel = ctx->shader->selector; unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; - LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_LOCAL_ADDR_SPACE); + LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS); LLVMValueRef var; assert(!ctx->ac.lds); @@ -2307,7 +2291,7 @@ void si_declare_compute_memory(struct si_shader_context *ctx) var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->i8, lds_size), "compute_lds", - AC_LOCAL_ADDR_SPACE); + AC_ADDR_SPACE_LDS); LLVMSetAlignment(var, 4); ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); @@ -2336,18 +2320,9 @@ static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *c ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); LLVMValueRef desc0, desc1; - if (HAVE_32BIT_POINTERS) { - desc0 = ptr; - desc1 = LLVMConstInt(ctx->i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - } else { - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, ""); - desc0 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""); - desc1 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""); - /* Mask out all bits except BASE_ADDRESS_HI. */ - desc1 = LLVMBuildAnd(ctx->ac.builder, desc1, - LLVMConstInt(ctx->i32, ~C_008F04_BASE_ADDRESS_HI, 0), ""); - } + desc0 = ptr; + desc1 = LLVMConstInt(ctx->i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); LLVMValueRef desc_elems[] = { desc0, @@ -2412,16 +2387,17 @@ static LLVMValueRef fetch_constant( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, - unsigned swizzle) + unsigned swizzle_in) { struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader_selector *sel = ctx->shader->selector; const struct tgsi_ind_register *ireg = ®->Indirect; unsigned buf, idx; + unsigned swizzle = swizzle_in & 0xffff; LLVMValueRef addr, bufp; - if (swizzle == LP_CHAN_ALL) { + if (swizzle_in == LP_CHAN_ALL) { unsigned chan; LLVMValueRef values[4]; for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) @@ -2435,7 +2411,7 @@ static LLVMValueRef fetch_constant( LLVMValueRef lo, hi; lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle); - hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1); + hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16)); return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), lo, hi); } @@ -2620,7 +2596,7 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); ac_build_kill_if_false(&ctx->ac, alpha_pass); } else { - ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0)); + ac_build_kill_if_false(&ctx->ac, ctx->i1false); } } @@ -2678,10 +2654,8 @@ static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, const_chan) * 4, 0); base_elt = buffer_load_const(ctx, const_resource, addr); - args->out[chan] = - LLVMBuildFAdd(ctx->ac.builder, args->out[chan], - LLVMBuildFMul(ctx->ac.builder, base_elt, - out_elts[const_chan], ""), ""); + args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, + out_elts[const_chan], args->out[chan]); } } @@ -2745,12 +2719,10 @@ static void emit_streamout_output(struct si_shader_context *ctx, break; case 2: /* as v2i32 */ case 3: /* as v4i32 (aligned to 4) */ + out[3] = LLVMGetUndef(ctx->i32); + /* fall through */ case 4: /* as v4i32 */ - vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); - for (int j = 0; j < num_comps; j++) { - vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j], - LLVMConstInt(ctx->i32, j, 0), ""); - } + vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); break; } @@ -2823,9 +2795,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, ctx->param_streamout_offset[i]); so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); - so_write_offset[i] = LLVMBuildMul(builder, so_write_index, - LLVMConstInt(ctx->i32, so->stride[i]*4, 0), ""); - so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, ""); + so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, + LLVMConstInt(ctx->i32, so->stride[i]*4, 0), + so_offset); } /* Write streamout data. */ @@ -3058,7 +3030,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef invocation_id, buffer, buffer_offset; - LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base; + LLVMValueRef lds_vertex_stride, lds_base; uint64_t inputs; invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); @@ -3066,10 +3038,9 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); - lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id, - lds_vertex_stride, ""); lds_base = get_tcs_in_current_patch_offset(ctx); - lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, ""); + lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, + lds_base); inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; while (inputs) { @@ -3293,21 +3264,9 @@ si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, unsigned param, unsigned return_index) { LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr, lo, hi; - - if (HAVE_32BIT_POINTERS) { - ptr = LLVMGetParam(ctx->main_fn, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, ""); - return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); - } - - ptr = LLVMGetParam(ctx->main_fn, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, ""); - ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, ""); - lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, ""); - hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, ""); - ret = LLVMBuildInsertValue(builder, ret, lo, return_index, ""); - return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, ""); + LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, ""); + return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); } /* This only writes the tessellation factor levels. */ @@ -3408,8 +3367,7 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) LLVMValueRef ret = ctx->return_value; ret = si_insert_input_ptr(ctx, ret, 0, 0); - if (HAVE_32BIT_POINTERS) - ret = si_insert_input_ptr(ctx, ret, 1, 1); + ret = si_insert_input_ptr(ctx, ret, 1, 1); ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); @@ -3424,11 +3382,6 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); -#if !HAVE_32BIT_POINTERS - ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4, - 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES); -#endif - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets, @@ -3452,8 +3405,7 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) LLVMValueRef ret = ctx->return_value; ret = si_insert_input_ptr(ctx, ret, 0, 0); - if (HAVE_32BIT_POINTERS) - ret = si_insert_input_ptr(ctx, ret, 1, 1); + ret = si_insert_input_ptr(ctx, ret, 1, 1); ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2); ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); @@ -3464,11 +3416,6 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) ctx->param_bindless_samplers_and_images, 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); -#if !HAVE_32BIT_POINTERS - ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4, - 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES); -#endif - unsigned vgpr; if (ctx->type == PIPE_SHADER_VERTEX) vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; @@ -4134,17 +4081,12 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, ddxy_out, iy_ll, ""); LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ix_ll, ""); - LLVMValueRef temp1, temp2; + LLVMValueRef temp; interp_el = ac_to_float(&ctx->ac, interp_el); - temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, offset_x, ""); - - temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, ""); - - temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, offset_y, ""); - - ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, ""); + temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el); + ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp); } interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); } @@ -4347,9 +4289,12 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, ""); LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - /* Signal vertex emission */ - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); + /* Signal vertex emission if vertex data was written. */ + if (offset) { + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); + } + if (!use_kill) lp_build_endif(&if_state); } @@ -4404,9 +4349,7 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, return; } - ac_build_intrinsic(&ctx->ac, - "llvm.amdgcn.s.barrier", - ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT); + ac_build_s_barrier(&ctx->ac); } static void si_create_function(struct si_shader_context *ctx, @@ -4618,6 +4561,30 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx, } } +static void declare_vs_blit_inputs(struct si_shader_context *ctx, + struct si_function_info *fninfo, + unsigned vs_blit_property) +{ + ctx->param_vs_blit_inputs = fninfo->num_params; + add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ + add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */ + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */ + } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { + add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ + add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ + } +} + static void declare_tes_input_vgprs(struct si_shader_context *ctx, struct si_function_info *fninfo) { @@ -4662,24 +4629,7 @@ static void create_function(struct si_shader_context *ctx) declare_global_desc_pointers(ctx, &fninfo); if (vs_blit_property) { - ctx->param_vs_blit_inputs = fninfo.num_params; - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */ - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */ - } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ - add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ - } + declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property); /* VGPRs */ declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); @@ -4736,13 +4686,8 @@ static void create_function(struct si_shader_context *ctx) case SI_SHADER_MERGED_VERTEX_TESSCTRL: /* Merged stages have 8 system SGPRs at the beginning. */ /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ - if (HAVE_32BIT_POINTERS) { - declare_per_stage_desc_pointers(ctx, &fninfo, - ctx->type == PIPE_SHADER_TESS_CTRL); - } else { - declare_const_and_shader_buffers(ctx, &fninfo, - ctx->type == PIPE_SHADER_TESS_CTRL); - } + declare_per_stage_desc_pointers(ctx, &fninfo, + ctx->type == PIPE_SHADER_TESS_CTRL); ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); @@ -4755,15 +4700,9 @@ static void create_function(struct si_shader_context *ctx) ctx->type == PIPE_SHADER_VERTEX); declare_vs_specific_input_sgprs(ctx, &fninfo); - if (!HAVE_32BIT_POINTERS) { - declare_samplers_and_images(ctx, &fninfo, - ctx->type == PIPE_SHADER_TESS_CTRL); - } ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - if (!HAVE_32BIT_POINTERS) /* Align to 2 dwords. */ - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32)); @@ -4797,13 +4736,8 @@ static void create_function(struct si_shader_context *ctx) case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: /* Merged stages have 8 system SGPRs at the beginning. */ /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ - if (HAVE_32BIT_POINTERS) { - declare_per_stage_desc_pointers(ctx, &fninfo, - ctx->type == PIPE_SHADER_GEOMETRY); - } else { - declare_const_and_shader_buffers(ctx, &fninfo, - ctx->type == PIPE_SHADER_GEOMETRY); - } + declare_per_stage_desc_pointers(ctx, &fninfo, + ctx->type == PIPE_SHADER_GEOMETRY); ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); @@ -4822,14 +4756,8 @@ static void create_function(struct si_shader_context *ctx) ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* Declare as many input SGPRs as the VS has. */ - if (!HAVE_32BIT_POINTERS) - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ } - if (!HAVE_32BIT_POINTERS) { - declare_samplers_and_images(ctx, &fninfo, - ctx->type == PIPE_SHADER_GEOMETRY); - } if (ctx->type == PIPE_SHADER_VERTEX) { ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32)); @@ -4979,6 +4907,13 @@ static void create_function(struct si_shader_context *ctx) shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32); + unsigned cs_user_data_dwords = + shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS]; + if (cs_user_data_dwords) { + ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR, + LLVMVectorType(ctx->i32, cs_user_data_dwords)); + } + for (i = 0; i < 3; i++) { ctx->abi.workgroup_ids[i] = NULL; if (shader->selector->info.uses_block_id[i]) @@ -5307,7 +5242,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) !mainb->rodata_size); assert(!epilog || !epilog->rodata_size); - r600_resource_reference(&shader->bo, NULL); + si_resource_reference(&shader->bo, NULL); shader->bo = si_aligned_buffer_create(&sscreen->b, sscreen->cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY, @@ -5320,7 +5255,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) /* Upload. */ ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, PIPE_TRANSFER_READ_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED); + PIPE_TRANSFER_UNSYNCHRONIZED | + RADEON_TRANSFER_TEMPORARY); /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are * endian-independent. */ @@ -5855,6 +5791,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, if (r != 0) { FREE(shader); shader = NULL; + } else { + si_fix_resource_usage(sscreen, shader); } return shader; } @@ -6170,7 +6108,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx) if (sel->force_correct_derivs_after_kill) { ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, ""); /* true = don't kill. */ - LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0), + LLVMBuildStore(ctx->ac.builder, ctx->i1true, ctx->postponed_kill); } @@ -6580,7 +6518,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, si_create_function(ctx, "wrapper", NULL, 0, &fninfo, si_get_max_workgroup_size(ctx->shader)); - if (is_merged_shader(ctx->shader)) + if (is_merged_shader(ctx)) ac_init_exec_full_mask(&ctx->ac); /* Record the arguments of the function as if they were an output of @@ -6638,7 +6576,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, /* Merged shaders are executed conditionally depending * on the number of enabled threads passed in the input SGPRs. */ - if (is_merged_shader(ctx->shader) && part == 0) { + if (is_merged_shader(ctx) && part == 0) { LLVMValueRef ena, count = initial[3]; count = LLVMBuildAnd(builder, count, @@ -6682,7 +6620,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, if (LLVMTypeOf(arg) != param_type) { if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { if (LLVMGetPointerAddressSpace(param_type) == - AC_CONST_32BIT_ADDR_SPACE) { + AC_ADDR_SPACE_CONST_32BIT) { arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); } else { @@ -6700,7 +6638,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); - if (is_merged_shader(ctx->shader) && + if (is_merged_shader(ctx) && part + 1 == next_shader_first_part) { lp_build_endif(&if_state); @@ -6994,7 +6932,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, /* Compile to bytecode. */ r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, - ctx.ac.module, debug, ctx.type, "TGSI shader", + ctx.ac.module, debug, ctx.type, + si_get_shader_name(shader, ctx.type), si_should_optimize_less(compiler, shader->selector)); si_llvm_dispose(&ctx); if (r) { @@ -7034,7 +6973,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, } /* Add the scratch offset to input SGPRs. */ - if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader)) + if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx)) shader->info.num_input_sgprs += 1; /* scratch byte offset */ /* Calculate the number of fragment input VGPRs. */ @@ -7180,26 +7119,11 @@ out: static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) { LLVMValueRef ptr[2], list; - bool is_merged_shader = - ctx->screen->info.chip_class >= GFX9 && - (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY || - ctx->shader->key.as_ls || ctx->shader->key.as_es); - - if (HAVE_32BIT_POINTERS) { - ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], - ac_array_in_const32_addr_space(ctx->v4i32), ""); - return list; - } - - /* Get the pointer to rw buffers. */ - ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS + 1); - list = ac_build_gather_values(&ctx->ac, ptr, 2); - list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, ""); - list = LLVMBuildIntToPtr(ctx->ac.builder, list, - ac_array_in_const_addr_space(ctx->v4i32), ""); + bool merged_shader = is_merged_shader(ctx); + + ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); + list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], + ac_array_in_const32_addr_space(ctx->v4i32), ""); return list; } @@ -7318,22 +7242,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, key->vs_prolog.states.instance_divisor_is_one & (1u << i); bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index; - - if (divisor_is_one || divisor_is_fetched) { - LLVMValueRef divisor = ctx->i32_1; - - if (divisor_is_fetched) { - divisor = buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->i32, i * 4, 0)); - divisor = ac_to_integer(&ctx->ac, divisor); + LLVMValueRef index = NULL; + + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = + buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->i32, i*16 + j*4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, + udiv_factors[0], udiv_factors[1], + udiv_factors[2], udiv_factors[3]); + } - /* InstanceID / Divisor + StartInstance */ - index = get_instance_index_for_fetch(ctx, - user_sgpr_base + - SI_SGPR_START_INSTANCE, - divisor); + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. */ + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + + SI_SGPR_START_INSTANCE), ""); } else { /* VertexID + BaseVertex */ index = LLVMBuildAdd(ctx->ac.builder, @@ -7417,8 +7351,6 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx, add_arg(&fninfo, ARG_SGPR, ctx->i32); add_arg(&fninfo, ARG_SGPR, ctx->i32); add_arg(&fninfo, ARG_SGPR, ctx->i32); - if (!HAVE_32BIT_POINTERS) - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); @@ -8186,9 +8118,9 @@ int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compile void si_shader_destroy(struct si_shader *shader) { if (shader->scratch_bo) - r600_resource_reference(&shader->scratch_bo, NULL); + si_resource_reference(&shader->scratch_bo, NULL); - r600_resource_reference(&shader->bo, NULL); + si_resource_reference(&shader->bo, NULL); if (!shader->is_binary_shared) ac_shader_binary_clean(&shader->binary);